删除已排序 Go 切片重复项的最快方法

Question

    mySlice := make([]uint32, 0, 4294967290)

// ...

        // Sort the slice
        sort.Slice(mySlice, func(i, j int) bool {
            x := mySlice[i]
            y := mySlice[j]
            return x < y
        })

删除切片重复项的最快方法是什么？

如何利用切片已排序的事实？

更新

我想出了这个：

func RemoveDuplicates(s []uint32) {
    if len(s) < 2 {
        return
    }
    tmp := make([]uint32, 0, len(s))

    for i := uint32(0); i < uint32(len(s)); i++ {
        // If current is not equal to next then store the current
        if s[i] != s[i+1] {
            tmp = append(tmp, s[i])
        }
    }

    // The last must be stored
    // Note that if it was repeated, the duplicates are NOT stored before
    tmp = append(tmp, s[len(s)-1])

    // Modify original slice
    s = nil
    s = append(s, tmp...)
}

有什么错误吗？有什么错误吗？有什么办法可以改善吗？

更新

正如 @mh-cbon 所指出的，正确的循环最大值应该是

i < uint32(len(s)) - 1

:

for i := uint32(0); i < uint32(len(s)) - 1; i++ {

Answer 1

不是最快的答案，而是逐步介绍使用 Go 语言优化一段代码的方法。

有关最快的非常正式的见解，请参阅 https://stackoverflow.com/a/6072100/4466350

您的代码有错误。总是写一个测试。

首先，让我们编写一个main

package main

import (
    "math/rand"
    "sort"
)

func main() {
}

func randSlice(max int) (ret []uint32) {
    // we should check that max does not exceed maxUINT32
    ret = make([]uint32, 0, max)
    r := rand.New(rand.NewSource(99))
    for i := 0; i < max; i++ {
        ret = append(ret, uint32(r.Intn(max)))
    }
    sort.Slice(ret, func(i, j int) bool {
        return ret[i] < ret[j]
    })
    return
}

func dedup1(s []uint32) []uint32 {
    if len(s) < 2 {
        return s
    }
    tmp := make([]uint32, 0, len(s))

    for i := uint32(0); i < uint32(len(s)); i++ {
        // If current is not equal to next then store the current
        if s[i] != s[i+1] {
            tmp = append(tmp, s[i])
        }
    }

    // The last must be stored
    // Note that if it was repeated, the duplicates are NOT stored before
    tmp = append(tmp, s[len(s)-1])

    // Modify original slice
    s = nil
    s = append(s, tmp...)
    return s
}

以及附带的测试

package main

import "testing"

func TestDedup1(t *testing.T) {
    s := randSlice(10)
    res := dedup1(s)
    uniq := map[uint32]bool{}
    for _, r := range res {
        _, ok := uniq[r]
        if ok {
            t.Fatalf("found duplicates\ninput=%#v\nresult=%#v\n", s, res)
        }
        uniq[r] = true
    }
}

运行这个我们得到

$ go test -v . 
=== RUN   TestDedup1
--- FAIL: TestDedup1 (0.00s)
panic: runtime error: index out of range [10] with length 10 [recovered]
    panic: runtime error: index out of range [10] with length 10

goroutine 18 [running]:
testing.tRunner.func1.1(0x536680, 0xc0000da040)
    /home/mh-cbon/.gvm/gos/go1.15.2/src/testing/testing.go:1076 +0x30d
testing.tRunner.func1(0xc000082600)
    /home/mh-cbon/.gvm/gos/go1.15.2/src/testing/testing.go:1079 +0x41a
panic(0x536680, 0xc0000da040)
    /home/mh-cbon/.gvm/gos/go1.15.2/src/runtime/panic.go:969 +0x175
test/d/dup.dedup1(0xc000094060, 0xa, 0xa, 0xa, 0x6124a0, 0xc00003c770)
    /home/mh-cbon/gow/src/test/d/dup/main.go:32 +0x248
test/d/dup.TestDedup1(0xc000082600)
    /home/mh-cbon/gow/src/test/d/dup/main_test.go:7 +0x70
testing.tRunner(0xc000082600, 0x54fbf0)
    /home/mh-cbon/.gvm/gos/go1.15.2/src/testing/testing.go:1127 +0xef
created by testing.(*T).Run
    /home/mh-cbon/.gvm/gos/go1.15.2/src/testing/testing.go:1178 +0x386
FAIL    test/d/dup  0.006s
FAIL

我们通过适当检查切片边界来修复此问题。

在

dedup1

中，将此条件

if s[i] != s[i+1] {

更改为

if i+1 < uint32(len(s)) && s[i] != s[i+1] {

，或者更好的是，将迭代最大值减少 1

for i := uint32(0); i < uint32(len(s))-1; i++ {

接下来，编写一个函数来生成具有随机重复项的切片。

func randSliceWithDups(max int) (ret []uint32) {
    ret = randSlice(max / 2)
    ret = append(ret, ret...)
    sort.Slice(ret, func(i, j int) bool {
        return ret[i] < ret[j]
    })
    return
}

书写

randSliceWithDups(50)

得到切片，例如

[0 0 1 1 2 2 3 3 3 3 4 4 5 5 5 5 6 6 6 6 7 7 8 8 9 9 9 9 12 12 13 13 18 18 18 18 18 18 19 19 20 20 20 20 21 21 22 22 24 24]

再次更新测试

func TestDedup1_with_dups(t *testing.T) {
    s := randSliceWithDups(10)
    res := dedup1(s)
    uniq := map[uint32]bool{}
    for _, r := range res {
        _, ok := uniq[r]
        if ok {
            t.Fatalf("found duplicates\ninput=%#v\nresult=%#v\n", s, res)
        }
        uniq[r] = true
    }
}

接下来，添加一个基准；它将帮助我们发现性能问题并随着时间的推移保持性能。

func BenchmarkDedup1_1000(b *testing.B) {
    s := randSliceWithDups(100)
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        _ = dedup1(s)
    }
}

运行这个我们得到：

$ go test -v . -bench=.
=== RUN   TestDedup1
--- PASS: TestDedup1 (0.00s)
=== RUN   TestDedup1_with_dups
--- PASS: TestDedup1_with_dups (0.00s)
goos: linux
goarch: amd64
pkg: test/d/dup
BenchmarkDedup1_1000
BenchmarkDedup1_1000-4        172087          6353 ns/op        5504 B/op          2 allocs/op
PASS
ok      test/d/dup  1.174s

让我们声明一个明显的事实，每个人都发现阅读您的初始代码，甚至没有编写基准，您正在分配。

这就提出了一个问题，即确定是否允许您就地修改输入切片。如果您可以就地更改它，我们可能会利用这一点来防止分配并加快您的程序。

从头开始编写的一个解决方案（考虑在 geekforgeeks 等网站上搜索普遍接受的解决方案）是迭代切片并维护下一个要写入的位置的索引。当找到非重复项时，将非重复项保存到最后一个位置，然后将该索引增加 1。最后，将切片返回到递增索引的值。

func dedup2(s []uint32) []uint32 {
    if len(s) < 2 {
        return s
    }

    var e int
    for i := 1; i < len(s); i++ {
        if s[i] == s[i-1] {
            continue
        }
        s[e] = s[i]
        e++
    }

    return s[:e]
}

再次添加测试和工作台，并检查结果。

func TestDedup2(t *testing.T) {
    s := randSlice(10)
    res := dedup2(s)
    uniq := map[uint32]bool{}
    for _, r := range res {
        _, ok := uniq[r]
        if ok {
            t.Fatalf("found duplicates\ninput=%#v\nresult=%#v\n", s, res)
        }
        uniq[r] = true
    }
}

func TestDedup2_with_dups(t *testing.T) {
    s := randSliceWithDups(10)
    res := dedup2(s)
    uniq := map[uint32]bool{}
    for _, r := range res {
        _, ok := uniq[r]
        if ok {
            t.Fatalf("found duplicates\ninput=%#v\nresult=%#v\n", s, res)
        }
        uniq[r] = true
    }
}

func BenchmarkDedup2_1000(b *testing.B) {
    s := randSliceWithDups(100)
    b.ResetTimer()
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        _ = dedup2(s)
    }
}

其产量：

$ go test -v . -bench=.
=== RUN   TestDedup1
--- PASS: TestDedup1 (0.00s)
=== RUN   TestDedup1_with_dups
--- PASS: TestDedup1_with_dups (0.00s)
=== RUN   TestDedup2
--- PASS: TestDedup2 (0.00s)
=== RUN   TestDedup2_with_dups
--- PASS: TestDedup2_with_dups (0.00s)
goos: linux
goarch: amd64
pkg: test/d/dup
BenchmarkDedup1_1000
BenchmarkDedup1_1000-4       1764574           673 ns/op         544 B/op          2 allocs/op
BenchmarkDedup2_1000
BenchmarkDedup2_1000-4       7758907           152 ns/op           0 B/op          0 allocs/op
PASS
ok      test/d/dup  3.224s

提升 4 倍！凉爽的！下一步是什么？接下来是找到一种更好的算法，它可以减少执行次数、查找次数等。

但是，最后一个版本包含一个错误！你发现了吗？

参见这个测试，它比另一个测试更好，因为它不依赖于随机数，而是依赖于具有强大相等性检查的静态值。使用这些类型的测试，您可以定制您的输入来检查细粒度的情况。

func TestDedup2_static(t *testing.T) {
    type expectation struct {
        input []uint32
        want  []uint32
    }

    expectations := []expectation{
        expectation{
            input: []uint32{0, 0, 1, 2, 3, 3, 3, 4, 4, 5},
            want:  []uint32{0, 1, 2, 3, 4, 5},
        },
        expectation{
            input: []uint32{0, 1, 2, 3, 3, 3, 4, 4, 5},
            want:  []uint32{0, 1, 2, 3, 4, 5},
        },
    }

    for _, e := range expectations {
        res := dedup2(e.input)
        if !reflect.DeepEqual(res, e.want) {
            t.Fatalf("invlaid result, wanted=%#v\ngot=%#v\n", e.want, res)
        }
    }
}

它使用表驱动测试，如https://dave.cheney.net/2013/06/09/writing-table-driven-tests-in-go

所述

让我们解决这个问题：

func dedup2(s []uint32) []uint32 {
    if len(s) < 2 {
        return s
    }

    var e int = 1
    for i := 1; i < len(s); i++ {
        if s[i] == s[i-1] {
            continue
        }
        s[e] = s[i]
        e++
    }

    return s[:e]
}

Answer 2

要删除切片的重复元素，您可以创建一个映射并将切片值分配为映射的键，然后迭代映射并将键值附加到新切片。以下是相同逻辑的代码：

package main

import (
    "fmt"
    "sort"
)

func removeDupe(slc []int) []int {
    var tmpSlice []int
    chkMap := make(map[int]bool)

    for _, val := range slc {
        chkMap[val] = true
    }

    for k, _ := range chkMap {
        tmpSlice = append(tmpSlice, k)
    }
    sort.Ints(tmpSlice)
    return tmpSlice
}

func main() {

    mySlice := []int{1, 2, 3, 4, 5, 1, 2, 3, 9, 0}
    formattedSlice := removeDupe(mySlice)

    fmt.Println(formattedSlice)

}

输出：

[0 1 2 3 4 5 9]

Answer 3

下面的通用方法怎么样，这适用于任何可比较的类型，并简单地检查该项目是否已经在地图中，如果不是，则假设它是唯一的并将其添加到我们的最终数组中。

这避免了上面迭代两个数组的方法，这样我们只迭代一个数组。

package main

import "fmt"

func RemoveDuplicates[T comparable](slice []T) []T {
    itemMap := make(map[T]bool, len(slice))
    items := make([]T, 0)
    for _, item := range slice {
        if _, ok := itemMap[item]; !ok {
            items = append(items, item)
            itemMap[item] = true
        }
    }
    return items
}

func main() {
    duplicateStr := []string{"a", "a", "b", "c"}
    dedupeStr := RemoveDuplicates(duplicateStr)
    fmt.Printf("%v\n", dedupeStr)

    duplicateInts := []int{0, 1, 2, 3, 3, 2}
    dedupeInts := RemoveDuplicates(duplicateInts)
    fmt.Printf("%v\n", dedupeInts)
}

这保持了数组的顺序。

[a b c]
[0 1 2 3]

Golang 游乐场：https://go.dev/play/p/9Y8AHe2AkIM

如果您想对指针数组进行重复数据删除，只要指向的每个对象都实现 .String() 方法即可。

package main

import "fmt"

type Stringer interface {
    String() string
}

// Remove all duplicate objects in an array of pointers
func RemoveDuplicateStringers[T Stringer](slice []*T) []*T {
    itemMap := make(map[string]*T)
    items := make([]*T, 0)
    for _, item := range slice {
        if item != nil {
            key := (*item).String()
            if _, ok := itemMap[key]; !ok {
                itemMap[key] = item
                items = append(items, item)
            }
        }
    }
    return items
}

func main() {
    // Use net.IP as a reference
    ip1 := net.ParseIP("1.1.1.1")
    ip2 := net.ParseIP("1.0.0.1")
    ip3 := net.ParseIP("1.1.1.1")
    ip4 := net.ParseIP("1.2.3.4")
    ip5 := net.ParseIP("1.1.1.1")
    ip6 := net.ParseIP("1.0.0.1")
    ip7 := net.ParseIP("1.2.3.4")
    duplicates := []*net.IP{&ip1, &ip2, &ip3, &ip4, &ip5, &ip6, &ip7}
    dedupe := RemoveDuplicateStringers(duplicates)
    fmt.Printf("%s", dedupe)
}

这里也保留了数组的顺序

[1.1.1.1 1.0.0.1 1.2.3.4]

Goland 游乐场：https://go.dev/play/p/Q5yr_2FxQ_H

Answer 4

您现在可以使用标准库了。尝试这样：

import (
        "slices"
        "sort"
        "rand"
        )

    func main(){
        ids := make([]int,100)
        for i := range ids {
            ids[i] = rand.Int()
        }
        // now ids is random filled slice
        sort.Ints(ids)
        ids = slices.Compact(ids)
        // now ids sorted an uniq
    }

删除已排序 Go 切片重复项的最快方法

问题描述投票：0回答：4

更新

更新

4个回答

最新问题

删除已排序 Go 切片重复项的最快方法

问题描述 投票：0回答：4

更新

更新

4个回答

最新问题

问题描述投票：0回答：4