美文网首页程序员设计模式
手撸golang 基本数据结构与算法 哈希表

手撸golang 基本数据结构与算法 哈希表

作者: 老罗话编程 | 来源:发表于2021-02-18 08:03 被阅读0次

    手撸golang 基本数据结构与算法 哈希表

    缘起

    最近阅读<<我的第一本算法书>>(【日】石田保辉;宫崎修一)
    本系列笔记拟采用golang练习之

    哈希表

    哈希表存储的是由键(key)和值(value)组成的数据。
    在哈希表中,我们可以利用哈希函数快速访问到数组中的目标数据。
    
    如果发生哈希冲突(哈希函数对不同的键, 返回了相同的哈希值),
    就使用链表进行存储(因此, 哈希表一般至少是数组+链表的二维结构)。
    
    如果数组的空间太小,
    使用哈希表的时候就容易发生冲突,
    线性查找的使用频率也会更高;
    反过来,如果数组的空间太大,
    就会出现很多空箱子,造成内存的浪费。
    
    摘自 <<我的第一本算法书>>(【日】石田保辉;宫崎修一)
    

    目标

    • 实现一个哈希表, 提供对键值对数据的增删改查, 且性能不能太差
    • 物理结构
      • 采用分区 + 数组 + 链表的三维结构
      • 分区是一个双向链表, 由小到大呈2次幂增长
      • 当前分区总是指向最新也是最大的那个分区
      • 查找某个键时, 需要遍历所有分区
    • 哈希函数
      • 合适的哈希函数对性能影响非常巨大
      • 对哈希函数的要求一般是足够快, 足够"散", 对不同键值最好能随机均匀分布
      • 本示例采用hash/crc64, 多项式为ECMA
      • 如果哈希函数的计算比较耗时, 则应当尽可能重复利用计算结果
    • 扩容与缩容
      • 扩容
        • 填充因子为3/4, 即当前分区的元素数>容量的3/4时, 创建2倍大的新分区
        • 扩容不做任何拷贝和rehash.
        • 扩容后, 非当前分区变成只读和只删.
      • 缩容: 当某个分区未持有任何元素, 且不为当前分区时, 删除此分区

    设计

    • IHasher: 哈希支持接口, 定义哈希计算函数, 和键值相等比较函数
    • IMap: 哈希表接口
    • IMapIterator: 哈希表迭代器接口
    • tCrc64Hasher:
      • 基于hash/crc64, 实现IHasher接口.
      • 支持多种键类型
      • 对于整型的键, 返回自身;
      • 其他类型的键, 转为%v字符串再计算crc64的哈希值.
    • tHashMap: 哈希表的实现, 使用分区(双链)+数组+链表(单链)三维结构
    • tPartition: 哈希表分区. 其大小呈2次幂
    • tBucket: 哈希桶, 每个桶内的任何元素, 哈希值是一致的
    • tHashMapIterator: 哈希表迭代器的实现

    单元测试

    hashmap_test.go, 除基本功能测试, 还测试了百万键值插入+遍历的性能

    package data_structure
    
    import (
        "fmt"
        "learning/gooop/data_structure/hashmap"
        "strconv"
        "testing"
        "time"
    )
    
    func Test_HashMap(t *testing.T) {
        fnAssertTrue := func(b bool, msg string) {
            if !b {
                t.Fatal(msg)
            }
        }
    
        fnEquals := func(a interface{}, b interface{}) bool {
            i1,b1 := a.(int)
            if b1 {
                i2,b2 := b.(int)
                if b2 {
                    return i1 == i2
                }
            }
    
            s1,b1 := a.(string)
            if b1 {
                s2,b2 := b.(string)
                if b2 {
                    return s1 == s2
                }
            }
    
            return a == b
        }
    
        hasher := hashmap.NewCrc64Hashful(fnEquals)
        hm := hashmap.NewHashMap(hasher, 2)
    
        hm.Put(1, "10")
        t.Log(hm)
        fnAssertTrue(hm.Size() == 1, "expecting size == 1")
        fnAssertTrue(hm.IsNotEmpty(), "expecting not empty")
    
        ok,v := hm.Get(1)
        fnAssertTrue(ok, "expecting ok")
        fnAssertTrue(v == "10", "expecting 10")
    
        hm.Put(2, "20")
        hm.Put(3, "30")
        hm.Put(4, "40")
        hm.Put(5, "50")
        t.Log(hm)
        fnAssertTrue(hm.Size() == 5, "expecting size == 5")
        ok,v = hm.Get(2)
        fnAssertTrue(ok == true && v == "20", "expecting true and 20")
    
        hm.Clear()
        t.Log(hm)
        fnAssertTrue(hm.Size() == 0, "expecting size == 0")
        fnAssertTrue(hm.IsEmpty(), "expecting empty")
    
        iter := hm.Iterator()
        fnAssertTrue(!iter.More(), "expecting no more")
    
        hm.Put(1, "10")
        hm.Put(2, "20")
        hm.Put(3, "30")
        t.Log(hm)
        fnAssertTrue(hm.Has(1) && hm.Has(2) && hm.Has(3) && !hm.Has(4), "expecting has 1,2,3")
    
        hm.Put(4, "40")
        hm.Put(5, "50")
        hm.Put(6, "60")
        t.Log(hm)
        iter = hm.Iterator()
        fnAssertTrue(iter.More(), "expecting more")
        e,k,v := iter.Next()
        t.Logf("%v>%s", k, v)
        fnAssertTrue(e == nil, "e == nil")
        e,k,v = iter.Next()
        t.Logf("%v>%s", k, v)
        fnAssertTrue(e == nil, "e == nil")
        e,k,v = iter.Next()
        t.Logf("%v>%s", k, v)
        fnAssertTrue(e == nil, "e == nil")
        e,k,v = iter.Next()
        t.Logf("%v>%s", k, v)
        fnAssertTrue(e == nil, "e == nil")
        e,k,v = iter.Next()
        t.Logf("%v>%s", k, v)
        fnAssertTrue(e == nil, "e == nil")
        e,k,v = iter.Next()
        t.Logf("%v>%s", k, v)
        fnAssertTrue(e == nil, "e == nil")
        e,k,v = iter.Next()
        fnAssertTrue(e != nil, "expecting e != nil")
    
        ok,v = hm.Remove(3)
        t.Log(hm)
        fnAssertTrue(ok && v == "30" && hm.Size() == 5, "expecting remove 30")
    
        ok,v = hm.Remove(2)
        t.Log(hm)
        fnAssertTrue(ok && v == "20" && hm.Size() == 4, "expecting remove 20")
    
        t0 := time.Now().UnixNano()
        hm.Clear()
        size := 1000 * 1000
        for i := 0; i < size;i++ {
            hm.Put(strconv.Itoa(i), i)
        }
        millis := (time.Now().UnixNano() - t0) / 1000000
        t.Logf("putting %v string>int = %v ms", size, millis)
        fnAssertTrue(hm.Size() == size, fmt.Sprintf("expecting %v", size))
    
        for i := 0;i < size;i++ {
            ok, v = hm.Get(strconv.Itoa(i))
            fnAssertTrue(ok == true && v == i, "expecting i")
        }
    }
    

    测试输出

    $ go test -v hashmap_test.go 
    === RUN   Test_HashMap
        hashmap_test.go:42: s=1,v=1 p[1:b[1 1]]
        hashmap_test.go:54: s=5,v=5 p[4:b[1 4],5:b[1 5]] p[1:b[1 1],2:b[1 2],3:b[1 3]]
        hashmap_test.go:60: s=0,v=6 p[]
        hashmap_test.go:70: s=3,v=9 p[1:b[1 1],2:b[1 2],3:b[1 3]]
        hashmap_test.go:76: s=6,v=12 p[1:b[1 1],2:b[1 2],3:b[1 3],4:b[1 4],5:b[1 5],6:b[1 6]]
        hashmap_test.go:80: 1>10
        hashmap_test.go:83: 2>20
        hashmap_test.go:86: 3>30
        hashmap_test.go:89: 4>40
        hashmap_test.go:92: 5>50
        hashmap_test.go:95: 6>60
        hashmap_test.go:101: s=5,v=13 p[1:b[1 1],2:b[1 2],4:b[1 4],5:b[1 5],6:b[1 6]]
        hashmap_test.go:105: s=4,v=14 p[1:b[1 1],4:b[1 4],5:b[1 5],6:b[1 6]]
        hashmap_test.go:115: putting 1000000 string>int = 1590 ms
    --- PASS: Test_HashMap (2.17s)
    PASS
    ok      command-line-arguments  2.181s
    

    IHasher.go

    哈希支持接口, 定义哈希计算函数, 和键值相等比较函数

    package hashmap
    
    type IHasher interface {
        Hash(key interface{}) uint64
        Equals(a interface{}, b interface{}) bool
    }
    

    IMap.go

    哈希表接口

    package hashmap
    
    type IMap interface {
        Size() int
        IsEmpty() bool
        IsNotEmpty() bool
    
        Put(key interface{}, value interface{})
        Get(key interface{}) (bool,interface{})
        Has(key interface{}) bool
        Remove(key interface{}) (bool, interface{})
        Clear()
    
        Iterator() IMapIterator
        String() string
    }
    

    IMapIterator.go

    哈希表迭代器接口

    package hashmap
    
    type IMapIterator interface {
        More() bool
        Next() (err error, key interface{}, value interface{})
    }
    

    tCrc64Hasher.go

    • 基于hash/crc64, 实现IHasher接口.
    • 支持多种键类型
    • 对于整型的键, 返回自身;
    • 其他类型的键, 转为%v字符串再计算crc64的哈希值.
    package hashmap
    
    import (
        "fmt"
        "hash/crc64"
    )
    
    var gCrc64Table = crc64.MakeTable(crc64.ECMA)
    
    type FnEquals func(a interface{}, b interface{}) bool
    
    type tCrc64Hasher struct {
        fnEquals FnEquals
    }
    
    const INT_MAX = int(^uint(0) >> 1)
    const INT_MIN = ^INT_MAX
    const INT32_MAX = int32(^uint32(0) >> 1)
    const INT32_MIN = ^INT32_MAX
    const INT64_MAX = int64(^uint64(0) >> 1)
    const INT64_MIN = ^INT64_MAX
    
    func (me *tCrc64Hasher) Hash(it interface{}) uint64 {
        if it == nil {
            return 0
        }
    
        if v,ok := it.(int);ok {
            return uint64(v - INT_MIN)
    
        } else if v,ok := it.(int64);ok {
            return uint64(v - INT64_MIN)
    
        } else if v,ok := it.(int32);ok {
            return uint64(v - INT32_MIN)
    
        } else if v,ok := it.(uint32);ok {
            return uint64(v)
    
        }  else if v,ok := it.(uint64);ok {
            return v
    
        } else if v,ok := it.(string);ok {
            return crc64.Checksum([]byte(v), gCrc64Table)
    
        } else {
            data := []byte(fmt.Sprintf("%v", it))
            return crc64.Checksum(data, gCrc64Table)
        }
    }
    
    
    func (me *tCrc64Hasher) Equals(a interface{}, b interface{}) bool {
        if a == nil && b == nil {
            return true
        }
        if a == nil || b == nil {
            return false
        }
    
        fn := me.fnEquals
        if fn == nil {
            return a == b
        } else {
            return fn(a, b)
        }
    }
    
    func NewCrc64Hashful(fn FnEquals) IHasher {
        return &tCrc64Hasher{
            fnEquals: fn,
        }
    }
    

    tHashMap.go

    哈希表的实现, 使用分区(双链)+数组+链表(单链)三维结构

    package hashmap
    
    import (
        "fmt"
        "strings"
    )
    
    type tHashMap struct {
        hasher     IHasher
        partitions *tPartition
        size int
        version int
    }
    
    
    func NewHashMap(hasher IHasher, capacity int) IMap {
        if capacity < 4 {
            capacity = 4
        }
    
        part := newPartition(hasher, capacity)
        return &tHashMap{
            hasher: hasher,
            partitions: part,
            size: 0,
            version: 0,
        }
    }
    
    func (me *tHashMap) Size() int {
        return me.size
    }
    
    func (me *tHashMap) IsEmpty() bool {
        return me.Size() <= 0
    }
    
    func (me *tHashMap) IsNotEmpty() bool {
        return !me.IsEmpty()
    }
    
    func (me *tHashMap) Put(key interface{}, value interface{}) {
        hash := me.hasher.Hash(key)
        ok, _, bucket, node, _ := me.findByKeyAndHash(key, hash)
        if ok {
            bucket.putAt(node, key, value)
    
        } else {
            if me.partitions.nearlyFull() {
                // create new partition
                part := newPartition(me.hasher, int(me.partitions.bucketCount * 2))
                part.next = me.partitions
                me.partitions.prev = part
                me.partitions = part
                part.appendByKeyAndHash(key, value, hash)
            } else {
                me.partitions.appendByKeyAndHash(key, value, hash)
            }
    
            me.size++
        }
    
        me.version++
    }
    
    
    func (me *tHashMap) findByKey(key interface{}) (ok bool, part *tPartition, bucket *tBucket, node *tLinkedNode, prev *tLinkedNode) {
        hash := me.hasher.Hash(key)
        return me.findByKeyAndHash(key, hash)
    }
    
    func (me *tHashMap) findByKeyAndHash(key interface{}, hash uint64) (ok bool, part *tPartition, bucket *tBucket, node *tLinkedNode, prev *tLinkedNode) {
        for part = me.partitions; part != nil; part = part.next {
            ok, bucket, node, prev = part.findByKeyAndHash(key, hash)
            if ok {
                return ok, part, bucket, node, prev
            }
        }
    
        return false, nil, nil, nil, nil
    }
    
    func (me *tHashMap) Get(key interface{}) (bool,interface{}) {
        ok, _, _, node, _ := me.findByKey(key)
        if ok {
            return true, node.value
        } else {
            return false, nil
        }
    }
    
    func (me *tHashMap) Has(key interface{}) bool {
        ok, _, _, _, _ := me.findByKey(key)
        return ok
    }
    
    func (me *tHashMap) Remove(key interface{}) (ok bool, value interface{}) {
        ok, part, bucket, node, prev := me.findByKey(key)
        if ok {
            value = node.value
            bucket.removeAt(node, prev)
    
            // 缩容
            if part.size <= 0 && part != me.partitions {
                me.removePartition(part)
            }
    
            me.size--
            me.version++
            return ok, value
    
        } else {
            return false, nil
        }
    }
    
    func (me *tHashMap) removePartition(part *tPartition) {
        prev := part.prev
        next := part.next
    
        if prev != nil {
            prev.next = next
        }
    
        if next != nil {
            next.prev = prev
        }
    
        if me.partitions == part {
            me.partitions = next
        }
    }
    
    func (me *tHashMap) Clear() {
        if me.IsEmpty() {
            return
        }
    
        part := newPartition(me.hasher, len(me.partitions.buckets))
        me.partitions = part
        me.size = 0
    
        me.version++
    }
    
    
    func (me *tHashMap) Iterator() IMapIterator {
        return newHashMapIterator(me)
    }
    
    func (me *tHashMap) String() string {
        itemStrings := make([]string, 0)
        for p := me.partitions;p != nil;p = p.next {
            itemStrings = append(itemStrings, p.String())
        }
        return fmt.Sprintf("s=%v,v=%v %s", me.size, me.version, strings.Join(itemStrings, " "))
    }
    

    tPartition.go

    哈希表分区. 其大小呈2次幂

    package hashmap
    
    import (
        "fmt"
        "strings"
    )
    
    type tPartition struct {
        hasher IHasher
        buckets []*tBucket
        bucketCount uint64
        prev *tPartition
        next *tPartition
        size int
        threshhold int
    }
    
    func newPartition(hasher IHasher, bucketCount int) *tPartition {
        it := &tPartition{
            hasher: hasher,
            buckets: make([]*tBucket, bucketCount),
            bucketCount: uint64(bucketCount),
            prev: nil,
            next: nil,
            size: 0,
            threshhold: bucketCount * 3 / 4,
        }
        for i,_ := range it.buckets {
            it.buckets[i] = newBucket(hasher)
        }
        return it
    }
    
    func (me *tPartition) putByKey(key interface{}, value interface{}) bool {
        hash := me.hasher.Hash(key)
        return me.putByKeyAndHash(key, value, hash)
    }
    
    func (me *tPartition) putByKeyAndHash(key interface{}, value interface{}, hash uint64) bool {
        if me.getBucketByHash(hash).put(key, value) {
            me.size++
            return true
        }
        return false
    }
    
    func (me *tPartition) appendByKeyAndHash(key interface{}, value interface{}, hash uint64) {
        me.getBucketByHash(hash).append(key, value)
        me.size++
    }
    
    func (me *tPartition) getBucketByKey(key interface{}) *tBucket {
        hash := me.hasher.Hash(key)
        return me.getBucketByHash(hash)
    }
    
    func (me *tPartition) getBucketByHash(hash uint64) *tBucket {
        return me.buckets[int(hash % me.bucketCount)]
    }
    
    func (me *tPartition) get(key interface{}) (bool,interface{}) {
        return me.getBucketByKey(key).get(key)
    }
    
    func (me *tPartition) findByKey(key interface{}) (ok bool,bucket *tBucket,node *tLinkedNode, prev *tLinkedNode) {
        bucket = me.getBucketByKey(key)
        ok,node,prev = bucket.find(key)
        return ok,bucket,node, prev
    }
    
    func (me *tPartition) findByKeyAndHash(key interface{}, hash uint64) (ok bool,bucket *tBucket,node *tLinkedNode, prev *tLinkedNode) {
        bucket = me.getBucketByHash(hash)
        ok,node,prev = bucket.find(key)
        return ok,bucket,node, prev
    }
    
    func (me *tPartition) remove(key interface{}) (bool, value interface{}) {
        ok,node := me.getBucketByKey(key).remove(key)
        if ok {
            me.size--
            return true, node.value
    
        } else {
            return false, nil
        }
    }
    
    func (me *tPartition) nearlyFull() bool {
        return me.size >= me.threshhold
    }
    
    func (me *tPartition) String() string {
        itemStrings := make([]string, 0)
        for i,b := range me.buckets {
            if b.size > 0 {
                itemStrings = append(itemStrings, fmt.Sprintf("%v:%s", i, b.String()))
            }
        }
        return fmt.Sprintf("p[%s]", strings.Join(itemStrings, ","))
    }
    

    tBucket.go

    哈希桶, 每个桶内的任何元素, 哈希值是一致的

    package hashmap
    
    import (
        "fmt"
        "strings"
    )
    
    type tBucket struct {
        hasher IHasher
        nodes  *tLinkedNode
        size int
    }
    
    type tLinkedNode struct {
        key interface{}
        value interface{}
        next *tLinkedNode
    }
    
    func newBucket(hasher IHasher) *tBucket {
        return &tBucket{
            hasher: hasher,
            nodes: nil,
            size: 0,
        }
    }
    
    func newLinkedNode(key interface{}, value interface{}) *tLinkedNode {
        return &tLinkedNode{
            key: key,
            value: value,
            next: nil,
        }
    }
    
    func (me *tBucket) put(key interface{}, value interface{}) bool {
        existed, node, _ := me.find(key)
        me.putAt(node, key, value)
        return !existed
    }
    
    func (me *tBucket) append(key interface{}, value interface{}) {
        me.putAt(nil, key, value)
    }
    
    func (me *tBucket) putAt(node *tLinkedNode, key interface{}, value interface{}) {
        if node != nil {
            node.value = value
            return
        }
    
        it := newLinkedNode(key, value)
        if me.nodes == nil {
            me.nodes = it
    
        } else {
            it.next = me.nodes
            me.nodes = it
        }
    
        me.size++
    }
    
    func (me *tBucket) get(key interface{}) (bool, interface{}) {
        ok, node, _ := me.find(key)
        if ok {
            return true, node.value
        }
        return false, nil
    }
    
    func (me *tBucket) find(key interface{}) (ok bool, node *tLinkedNode, prev *tLinkedNode) {
        prev = nil
        for it:=me.nodes;it != nil;it = it.next {
            if me.hasher.Equals(it.key, key) {
                return true, it, prev
            }
            prev = it
        }
        return false, nil, nil
    }
    
    
    func (me *tBucket) remove(key interface{}) (bool, *tLinkedNode) {
        ok,node, prev := me.find(key)
        if !ok {
            return false, nil
        }
    
        me.removeAt(node, prev)
        return true, node
    }
    
    
    func (me *tBucket) removeAt(node *tLinkedNode, prev *tLinkedNode) {
        if prev == nil {
            me.nodes = node.next
        } else {
            prev.next = node.next
        }
        me.size--
    }
    
    func (me *tBucket) String() string {
        itemStrings := make([]string, me.size)
        i := 0
        for it := me.nodes;it != nil;it = it.next {
            itemStrings[i] = fmt.Sprintf("%v", it.key)
            i++
        }
        return fmt.Sprintf("b[%v %s]", me.size, strings.Join(itemStrings, ","))
    }
    

    tHashMapIterator.go

    哈希表迭代器的实现

    package hashmap
    
    import "errors"
    
    type tHashMapIterator struct {
        hashMap *tHashMap
        pindex *tPartition
        bindex int
        nindex *tLinkedNode
        version int
        visited int
    }
    
    
    func newHashMapIterator(hashMap *tHashMap) IMapIterator {
        return &tHashMapIterator{
            hashMap: hashMap,
            pindex: hashMap.partitions,
            bindex: -1,
            nindex: nil,
            version: hashMap.version,
            visited: 0,
        }
    }
    
    
    func (me *tHashMapIterator) nextNode() *tLinkedNode {
        node := me.nindex
        for {
            if node == nil {
                bkt := me.nextBucket()
                if bkt == nil {
                    return nil
                } else {
                    me.nindex = bkt.nodes
                    return me.nindex
                }
    
            } else {
                node = node.next
                if node != nil {
                    me.nindex = node
                    return node
                }
            }
        }
    }
    
    func (me *tHashMapIterator) nextBucket() *tBucket {
        part := me.pindex
        bi := me.bindex + 1
    
        for {
            if bi >= len(part.buckets) {
                part = me.nextPartition()
                if part == nil {
                    return nil
                }
    
                bi = 0
            }
    
            bkt := part.buckets[bi]
            if bkt.nodes != nil {
                me.bindex = bi
                return bkt
            }
    
            bi++
        }
    }
    
    
    func (me *tHashMapIterator) nextPartition() *tPartition {
        if me.pindex == nil {
            return nil
        }
        me.pindex = me.pindex.next
        return me.pindex
    }
    
    func (me *tHashMapIterator) More() bool {
        if me.version != me.hashMap.version {
            return false
        }
        return me.visited < me.hashMap.size
    }
    
    func (me *tHashMapIterator) Next() (err error, key interface{}, value interface{}) {
        if me.version != me.hashMap.version {
            return gConcurrentModificationError, nil, nil
        }
    
        if !me.More() {
            return gNoMoreElementsError, nil, nil
        }
    
        node := me.nextNode()
        if node == nil {
            return gNoMoreElementsError, nil, nil
    
        } else {
            me.visited++
            return nil, node.key, node.value
        }
    }
    
    var gConcurrentModificationError = errors.New("concurrent modification error")
    var gNoMoreElementsError = errors.New("no more elements")
    

    (end)

    相关文章

      网友评论

        本文标题:手撸golang 基本数据结构与算法 哈希表

        本文链接:https://www.haomeiwen.com/subject/xdlgxltx.html