我正在尝试编写一个算法的并行版本:该算法接收一个点和一个点列表,并在列表中找到与第一个点最接近的点,以便将并行版本的执行时间与串行版本进行比较。问题是并行版本运行需要 1 分钟以上,而串行版本只需要 1 秒左右。
为了确保并行性效果明显,我正在使用大约1200万点的列表来测试代码。
我的CPU详细信息:
这是两个版本:
公共部分:
// Point represents a point in the 2D Euclidean plane.
type Point struct {
X float64
Y float64
}
// dist returns the Euclidean distance between p and q.
//
// The squared terms are computed by direct multiplication instead of
// math.Pow: math.Pow is a general-purpose routine and is far slower
// than a plain multiply for a constant integer exponent (the test file
// below already uses this faster form; this makes both versions agree).
func dist(p, q Point) float64 {
	dx := p.X - q.X
	dy := p.Y - q.Y
	return math.Sqrt(dx*dx + dy*dy)
}
顺序功能:
// s_argmin sequentially scans points_list[i..j] (inclusive) and returns
// the index of the point in that range closest to p.
//
// Bug fix: the original initialized best=0 and d from points_list[0],
// so for i > 0 it could return an index outside [i, j] whenever the
// first element of the slice happened to be closer than anything in the
// requested range. The scan now starts from index i.
func s_argmin(p Point, points_list []Point, i, j int) int {
	best := i
	d := dist(p, points_list[i])
	for k := i + 1; k <= j; k++ {
		if nd := dist(p, points_list[k]); nd < d {
			d = nd
			best = k
		}
	}
	return best
}
并行功能:
// parallelCutoff is the range size below which p_argmin falls back to a
// plain serial scan: spawning two goroutines plus two channels per split
// costs orders of magnitude more than evaluating a few thousand distances
// directly, which is exactly why the fully parallel version measured
// slower than the serial one.
const parallelCutoff = 4096

// p_argmin returns the index of the point in points_list[i..j]
// (inclusive) closest to p, searching the two halves of large ranges
// concurrently and small ranges serially.
func p_argmin(p Point, points_list []Point, i, j int) int {
	// Small ranges: serial scan. Without this cutoff the recursion
	// creates O(j-i) goroutines and channels, and that overhead dwarfs
	// the nanosecond-scale cost of a single distance evaluation.
	if j-i < parallelCutoff {
		best := i
		d := dist(p, points_list[i])
		for k := i + 1; k <= j; k++ {
			if nd := dist(p, points_list[k]); nd < d {
				d = nd
				best = k
			}
		}
		return best
	}
	mid := (i + j) / 2
	c := make(chan int)
	go func() {
		c <- p_argmin(p, points_list, i, mid)
	}()
	// Search the upper half on the current goroutine: only one extra
	// goroutine (and one channel) is needed per split, halving the
	// spawn overhead of the original two-goroutine version.
	argmin2 := p_argmin(p, points_list, mid+1, j)
	argmin1 := <-c
	// No close() calls: the original closed the channels from the
	// receiving side, which is both unnecessary (channels are garbage
	// collected) and against the convention that only senders close.
	if dist(p, points_list[argmin1]) < dist(p, points_list[argmin2]) {
		return argmin1
	}
	return argmin2
}
我也尝试过限制并行度:只有当输入规模 (j-i) 大于某个阈值时才执行并行版本,否则退回到串行版本,但串行版本始终更快。
如何改善并行版本的结果?
无意义的微基准测试产生了毫无意义的结果。我没有理由相信 p_argmin 可能比 s_argmin 快。
$ go test micro_test.go -bench=. -benchmem
goos: linux
goarch: amd64
BenchmarkS-4 946197 1263 ns/op 0 B/op 0 allocs/op
--- BENCH: BenchmarkS-4
micro_test.go:81: 1 946197 946197
BenchmarkP-4 3477 302076 ns/op 80958 B/op 843 allocs/op
--- BENCH: BenchmarkP-4
micro_test.go:98: 839 2917203 3477
$
micro_test.go
:
package main
import (
"math"
"sync"
"testing"
)
// Point represents a point in the 2D Euclidean plane.
type Point struct {
X float64
Y float64
}
// dist reports the Euclidean distance between points p and q.
// (The math.Pow form was abandoned for plain multiplication, which is
// much cheaper for a constant square.)
func dist(p, q Point) float64 {
	dx, dy := p.X-q.X, p.Y-q.Y
	return math.Sqrt(dx*dx + dy*dy)
}
// s_argmin sequentially scans points_list[i..j] (inclusive) and returns
// the index of the point in that range closest to p. Every invocation is
// tallied in the shared nbm counter for the benchmark's bookkeeping.
//
// Bug fix: best/d were initialized from points_list[0], so for i > 0 the
// function could return an index outside [i, j]; the scan now starts at i.
// (The benchmarks always call with i == 0, so their results are unchanged.)
func s_argmin(p Point, points_list []Point, i, j int) int {
	mbm.Lock()
	nbm++
	mbm.Unlock()
	best := i
	d := dist(p, points_list[i])
	for k := i + 1; k <= j; k++ {
		if nd := dist(p, points_list[k]); nd < d {
			d = nd
			best = k
		}
	}
	return best
}
// p_argmin recursively splits points_list[i..j] (inclusive) in half,
// searches each half in its own goroutine, and returns the index of the
// point closest to p. Every invocation is tallied in nbm so the benchmark
// can report how many calls (and therefore goroutine/channel pairs) a
// single top-level search spawns — that count is the story the benchmark
// numbers tell.
func p_argmin(p Point, points_list []Point, i, j int) int {
	mbm.Lock()
	nbm++
	mbm.Unlock()
	if i == j {
		return i
	}
	mid := (i + j) / 2
	lc := make(chan int)
	rc := make(chan int)
	go func() { lc <- p_argmin(p, points_list, i, mid) }()
	go func() { rc <- p_argmin(p, points_list, mid+1, j) }()
	left, right := <-lc, <-rc
	if dist(p, points_list[left]) < dist(p, points_list[right]) {
		return left
	}
	return right
}
var (
// nbm counts how many times s_argmin/p_argmin were invoked during a
// benchmark run; mbm guards it because p_argmin increments it from
// many goroutines concurrently.
nbm int
mbm sync.Mutex
)
func BenchmarkS(b *testing.B) {
mbm.Lock()
nbm = 0
mbm.Unlock()
points := make([]Point, 420)
b.ResetTimer()
for N := 0; N < b.N; N++ {
s_argmin(points[0], points, 0, len(points)-1)
}
b.StopTimer()
mbm.Lock()
b.Log(float64(nbm)/float64(b.N), nbm, b.N)
mbm.Unlock()
}
func BenchmarkP(b *testing.B) {
mbm.Lock()
nbm = 0
mbm.Unlock()
points := make([]Point, 420)
b.ResetTimer()
for N := 0; N < b.N; N++ {
p_argmin(points[0], points, 0, len(points)-1)
}
b.StopTimer()
mbm.Lock()
b.Log(float64(nbm)/float64(b.N), nbm, b.N)
mbm.Unlock()
}
纯 [SERIAL] 代码执行流表明,每个 Point 的距离评估成本可以忽略不计(大约 36 [ns]),但并行化引入的附加开销必须计入总成本——业余爱好者往往因忽略这一点而歪曲对加速比(Speedup)的期望:

// ... The [SERIAL]   flow of code-execution took     77.095 µs for     [10]
// --------^^^^^^^^^^-----------------------------------|---------------------
// ... The [PARALLEL] flow of code-execution took    142.563 µs for     [10]
// ... The [PARALLEL] flow of code-execution took    386.27  µs for    [100]
// ... The [PARALLEL] flow of code-execution took   4260.941 µs for   [1000]
//                                                  |  |  |  ::: ns
//                                                  |  |  +____ µs
//                                                  |  +_______ ms
//                                                  +__________ s

鉴于此,实例化并行执行流(拆分与征服)所累积的附加开销如此巨大,以至于对于任何合理大小的 []Point 都很难证明其合理性。修订后的、严格计入开销的阿姆达尔定律(Amdahl's law)论证——而非其原始表述——在这里才是有效的:原始表述并没有迫使人们把这些隐藏的附加开销也考虑在内。
// SERIAL runs one serial nearest-point search over aListOfPOINTs and
// logs the index found. The deferred TimeTRACK helper (defined elsewhere
// in this file — TODO confirm its signature) logs the elapsed wall-clock
// time on return. The comment table below preserves measured timings for
// list sizes from 1 to 10,000,000 points.
func SERIAL( aPointToSEEK Point, aListOfPOINTs []Point ){
defer TimeTRACK( time.Now(), "The [SERIAL] flow of code-execution", len( aListOfPOINTs ) )
//
// 2020/03/09 07:17:54 The [SERIAL] flow of code-execution took 120.529 µs for [1]
// 2020/03/09 07:17:28 The [SERIAL] flow of code-execution took 194.565 µs for [10]
// 2020/03/09 07:11:28 The [SERIAL] flow of code-execution took 77.095 µs for [100]
// 2020/03/09 07:12:16 The [SERIAL] flow of code-execution took 260.771 µs for [1000]
// 2020/03/09 07:13:19 The [SERIAL] flow of code-execution took 591.604 µs for [10000]
// 2020/03/09 07:13:57 The [SERIAL] flow of code-execution took 4585.917 µs for [100000]
// 2020/03/09 07:14:33 The [SERIAL] flow of code-execution took 44317.063 µs for [1000000]
// 2020/03/09 07:10:30 The [SERIAL] flow of code-execution took 36141.75 µs for [1000000]
// 2020/03/09 07:15:10 The [SERIAL] flow of code-execution took 554986.415 µs for [10000000]
// 2020/03/09 07:24:10 The [SERIAL] flow of code-execution took 676098.025 µs for [10000000]
// | | | ... ns
// | | +____ µs
// | +_______ ms
// +__________ s
log.Printf( "%s got nearest aPointID# %d", "The [SERIAL] flow of code-execution", s_argmin( aPointToSEEK, aListOfPOINTs, 0, len( aListOfPOINTs ) - 1 ) )
}