（25）Go实现反向索引堆

作者: 哥斯拉啊啊啊哦 | 来源:发表于2019-05-03 20:38 被阅读0次

（25）Go实现反向索引堆
堆(go实现)
MySQL的全文索引Fulltext Index | 包括ngr
七层流量接入系统
Golang之数组和切片
什么是倒排索引
ReverseProxy 反向代理
ElasticSearch初识（二）
python索引原理
container之heap

普通堆（10）Go实现二叉堆-数组实现：https://www.jianshu.com/p/37bca5f2a6e9

为什么要有索引堆？
主要原因有以下两个：

1）如上图，对于一组数据来说，经过堆整理后，数据的大多数索引都发生了变化，之后如果还想改变数组内的值，会找不到对应的数据，除非去遍历数组，不过这样太消耗资源
2）如果这组数据中存储的是比较大的数据，例如1个长度为10000字节的字符串，那么堆中每次交换位置都会消耗大量资源。

解决这个问题的关键就是索引堆，如下图

解读：
1）indexes[]中存储的是data中数据的索引，每次在堆中对数据进行对比时，交换的不是data的数据，
而是data的索引，这样每次只交换索引，资源消耗小、速度快；
2）reverse是indexes的反向索引，即查找表，每次要改变data[i]的值，可通过reverse[i]找到
对应indexes的索引

最小反向查找索引堆的实现
// minIndexHeap is a min-heap that orders indexes into Data rather than the
// values themselves. All three slices are used from position 1; position 0 is
// left unused so that 0 can act as the "not present" marker in Reverse.
type minIndexHeap struct {
    // Size is the number of elements currently in the heap.
    Size    int
    // Indexes[p] is the Data index held at heap position p; position 1 is the root.
    Indexes []int
    // Data holds the stored values and is addressed through Indexes.
    Data    []int
    // Reverse[i] is the heap position of Data[i]; 0 means Data[i] is not in the heap.
    Reverse []int
}

// NewMinIndexHeap returns an empty heap sized for capacity elements. Each
// backing slice is allocated with capacity+1 entries because position 0 is
// not used for storage.
func NewMinIndexHeap(capacity int) *minIndexHeap {
    slots := capacity + 1
    h := new(minIndexHeap)
    h.Indexes = make([]int, slots)
    h.Data = make([]int, slots)
    h.Reverse = make([]int, slots)
    return h
}

// parent returns the heap position of the parent of position i. The root
// (position 1) is reported as its own parent, which lets upward walks stop
// there without a separate bounds test.
func parent(i int) int {
    p := i / 2
    if i == 1 {
        p = 1
    }
    return p
}

// leftChild returns the heap position of the left child of position i in the
// 1-based layout; the right child, if any, sits at the following position.
func leftChild(i int) int {
    return i << 1
}

// Contain reports whether Data[i] is currently part of the heap. It is false
// when i is below 1, when i is not a valid index of Reverse, or when
// Reverse[i] is 0 (the marker for "not in the heap").
func (h *minIndexHeap) Contain(i int) bool {
    inRange := i >= 1 && i < len(h.Reverse)
    return inRange && h.Reverse[i] != 0
}

// GetSize returns the number of elements currently stored in the heap.
func (h *minIndexHeap) GetSize() int { return h.Size }

// IsEmpty reports whether the heap currently holds no elements.
func (h *minIndexHeap) IsEmpty() bool { return h.Size == 0 }

// GetMin returns the smallest value in the heap without removing it. On an
// empty heap it returns 0 together with a non-nil error.
func (h *minIndexHeap) GetMin() (int, error) {
    if h.IsEmpty() {
        err := errors.New("failed to getMin,heap is empty")
        return 0, err
    }
    // Heap position 1 is the root; Indexes maps it to the Data slot of the minimum.
    top := h.Indexes[1]
    return h.Data[top], nil
}

// shiftUp moves the entry at heap position childI toward the root for as long
// as its value is smaller than its parent's. Only Indexes and Reverse are
// rewritten; Data is left untouched.
func (h *minIndexHeap) shiftUp(childI int) {
    for {
        up := parent(childI)
        // parent(1) is 1, so at the root a value is compared with itself and
        // the walk ends here without an explicit bounds check.
        if h.Data[h.Indexes[up]] <= h.Data[h.Indexes[childI]] {
            return
        }
        h.Indexes[up], h.Indexes[childI] = h.Indexes[childI], h.Indexes[up]
        // Keep the reverse lookup in step with the positions just swapped.
        h.Reverse[h.Indexes[up]] = up
        h.Reverse[h.Indexes[childI]] = childI
        childI = up
    }
}

// shiftDown moves the entry at heap position parI toward the leaves until the
// heap order holds again. Positions are inclusive on both ends: 1..Size.
// Only Indexes and Reverse are rewritten; Data is left untouched.
func (h *minIndexHeap) shiftDown(parI int) {
    for {
        left := leftChild(parI)
        if left > h.Size {
            // No children: parI is a leaf.
            return
        }

        right := left + 1
        if right > h.Size {
            // Only a left child exists, and it is the last position in the
            // heap, so at most one swap is needed.
            if h.Data[h.Indexes[parI]] > h.Data[h.Indexes[left]] {
                h.Indexes[parI], h.Indexes[left] = h.Indexes[left], h.Indexes[parI]
                h.Reverse[h.Indexes[parI]] = parI
                h.Reverse[h.Indexes[left]] = left
            }
            return
        }

        // Two children: continue with the smaller one, preferring the left on ties.
        smaller := right
        if h.Data[h.Indexes[left]] <= h.Data[h.Indexes[right]] {
            smaller = left
        }

        // Stop only when the parent is strictly smaller than both children;
        // a parent equal to the smaller child is still swapped downward.
        if h.Data[h.Indexes[parI]] < h.Data[h.Indexes[smaller]] {
            return
        }

        h.Indexes[parI], h.Indexes[smaller] = h.Indexes[smaller], h.Indexes[parI]
        h.Reverse[h.Indexes[parI]] = parI
        h.Reverse[h.Indexes[smaller]] = smaller
        parI = smaller
    }
}

// InsertItem adds item to the heap and sifts it up to its ordered position.
//
// Storage is 1-based, so a heap whose Data slice has length n holds at most
// n-1 items. Once that many are stored, InsertItem prints a message, leaves
// the heap unchanged and returns an error.
func (h *minIndexHeap) InsertItem(item int) error {
    // Position 0 is never used: capacity=10 means len(Data)=11, so the heap
    // is already full when Size reaches len(Data)-1. Comparing against
    // len(Data) would let Size grow to len(Data) and index out of range below.
    if h.Size >= len(h.Data)-1 {
        fmt.Println("failed to insertItem,heap is full.")
        return errors.New("heap is full.")
    }

    h.Size++
    pos := h.Size
    // A zero entry means this heap position has never been used, so it takes
    // the Data slot with the same number. A non-zero entry is a slot left
    // behind by an earlier extraction and is reused.
    if h.Indexes[pos] == 0 {
        h.Indexes[pos] = pos
    }

    slot := h.Indexes[pos]
    h.Data[slot] = item
    h.Reverse[slot] = pos

    h.shiftUp(pos)
    return nil
}

// ExtractMin removes the smallest value from the heap and returns it. On an
// empty heap it returns 0 together with a non-nil error.
//
// Removal swaps heap positions in Indexes rather than copying values between
// Data slots, so every element that stays in the heap keeps its Data index
// and can still be addressed by that index afterwards.
func (h *minIndexHeap) ExtractMin() (int, error) {
    if h.IsEmpty() {
        return 0, errors.New(
            "failed to extractMin,heap is empty")
    }

    minSlot := h.Indexes[1]
    lastSlot := h.Indexes[h.Size]
    retVal := h.Data[minSlot]

    // Move the last heap position to the root and park the removed slot just
    // past the live range, where a later insert can pick it up again.
    h.Indexes[1], h.Indexes[h.Size] = lastSlot, minSlot

    // Order matters when the heap has a single element (minSlot == lastSlot):
    // the final write must be the 0 that marks the slot as not in the heap.
    h.Reverse[lastSlot] = 1
    h.Reverse[minSlot] = 0
    h.Data[minSlot] = 0

    h.Size--
    h.shiftDown(1)
    return retVal, nil
}

// Change replaces the value stored at Data[i] with newItem and restores the
// heap order. It returns an error when the heap is empty or when i does not
// refer to an element currently in the heap.
func (h *minIndexHeap) Change(i, newItem int) error {
    if h.IsEmpty() {
        return errors.New("failed to change,heap is empty")
    }
    // Rejects out-of-range indexes as well as slots not currently in the heap.
    if !h.Contain(i) {
        return errors.New("failed to change ,index is illegal")
    }

    old := h.Data[i]
    if old == newItem {
        // Same value: the heap order cannot have changed.
        return nil
    }

    h.Data[i] = newItem
    if newItem < old {
        // A smaller value can only need to move toward the root.
        h.shiftUp(h.Reverse[i])
    } else {
        // A larger value can only need to move toward the leaves.
        h.shiftDown(h.Reverse[i])
    }
    return nil
}

// Print writes the heap to standard output one tree level per line, each line
// prefixed with its 1-based level number. Nothing is printed for an empty heap.
func (h *minIndexHeap) Print() {
    pos := 1
    // Level n of a binary heap holds up to 2^(n-1) positions, so the width
    // doubles on every line.
    for level, width := 1, 1; pos <= h.Size; level, width = level+1, width*2 {
        fmt.Printf("%d层: ", level)
        for n := 0; n < width && pos <= h.Size; n++ {
            fmt.Printf(" %v ", h.Data[h.Indexes[pos]])
            pos++
        }
        fmt.Println()
    }
}

测试
// main exercises the heap: it inserts eight random values, extracts three,
// inserts four more, and then changes two elements by their Data index,
// printing the heap after each stage. Any error returned by the heap is
// printed and ends the run.
func main() {
    a := indexminheap1.NewMinIndexHeap(10)

    for i := 0; i < 8; i++ {
        if err := a.InsertItem(rand.Intn(50) + 10); err != nil {
            fmt.Println(err)
            return
        }
    }
    fmt.Println("初始值:")
    a.Print()
    fmt.Println("=========")

    for i := 0; i < 3; i++ {
        if _, err := a.ExtractMin(); err != nil {
            fmt.Println(err)
            return
        }
    }
    fmt.Println("取出3个值:")
    a.Print()
    fmt.Println("=========")

    for i := 0; i < 4; i++ {
        if err := a.InsertItem(rand.Intn(50) + 51); err != nil {
            fmt.Println(err)
            return
        }
    }
    fmt.Println("添加4个值:")
    a.Print()
    fmt.Println(*a)
    fmt.Println("=========")

    err := a.Change(4, 9)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Println("索引4改为9:")
    a.Print()
    fmt.Println(*a)
    fmt.Println("=========")

    fmt.Println("索引2改为510:")
    err = a.Change(2, 510)
    if err != nil {
        fmt.Println(err)
        return
    }
    a.Print()
    fmt.Println(*a)
}

测试结果 //
初始值:
1层:  19 
2层:  41  28 
3层:  47  41  57  35 
4层:  50 
=========
取出3个值:
1层:  41 
2层:  41  50 
3层:  47  57 
=========
添加4个值:
1层:  41 
2层:  41  50 
3层:  47  57  57  51 
4层:  95  62 
{9 [0 1 5 6 2 7 3 4 8 9 0] [0 41 47 57 51 41 50 57 95 62 0] [0 1 4 6 7 2 3 5 8 9 0]}
=========
索引4改为9:
1层:  9 
2层:  41  41 
3层:  47  57  57  50 
4层:  95  62 
{9 [0 4 5 1 2 7 3 6 8 9 0] [0 41 47 57 9 41 50 57 95 62 0] [0 3 4 6 1 2 7 5 8 9 0]}
=========
索引2改为510:
1层:  9 
2层:  41  41 
3层:  62  57  57  50 
4层:  95  510 
{9 [0 4 5 1 9 7 3 6 8 2 0] [0 41 510 57 9 41 50 57 95 62 0] [0 3 9 6 1 2 7 5 8 4 0]}

总结：代码的难点在于维护好indexes、data、reverse这3个表的关系。
比如（1）先添加5个元素，取出2个元素，再添加3个元素，这种情况要处理好indexes表。
网上很多代码在（1）这种情况会有堆中数据不准确的bug。
我的思路是：都从索引1开始，0表示不存在，在取出操作中，reverse[i]=0表示该节点不存在，
indexes[i]中的值不变，在下次重新添加到这一步时，把值更新到data[indexes[i]]中，如果
indexes[i]=0，令indexes[i]=size，即当前data[i]的索引

有bug欢迎指出，转载请注明出处。