我正在尝试对一个 hdf5 文件进行排序。我将 hdf5 文件中的数据转换为 2D numpy 数组,并尝试通过稳定算法对其进行排序。插入步骤进行得很顺利。
但是我调试的时候查到,在merge这一步,当左数组或者右数组(保存排序前值的数组)元素被一一赋值给原数组的时候,左右数组也发生了变化,这弄乱了其余的排序,因为在整个块中只重复了一个元素。
这就是我正在使用的:
from collections import deque
import h5py
import sys
filename = sys.argv[1]
with h5py.File(filename, "r") as f:
a_group_key = list(f.keys())[0]
ds_arr = f[a_group_key][()]
MINIMUM= 32
def find_minrun(n):
r = 0
while n >= MINIMUM:
r |= n & 1
n >>= 1
return n + r
def insertion_sort(array, left, right):
for i in range(left+1,right+1):
#element = array[i]
j = i
while array[j-1][11]>array[j][11] and j>left :
array[[j, j-1]] = array[[j-1, j]]
j -= 1
#array[j+1] = element
return array
def merge(array, l, m, r):
array_length1= m - l + 1
array_length2 = r - m
left = []
right = []
for i in range(0, array_length1):
left.append(array[l + i])
for i in range(0, array_length2):
right.append(array[m + 1 + i])
i=0
j=0
k=l
while j < array_length2 and i < array_length1:
if left[i][11] <= right[j][11]:
array[k] = left[i]
i += 1
else:
array[k] = right[j]
j += 1
k += 1
while i < array_length1:
array[k] = left[i]
k += 1
i += 1
while j < array_length2:
array[k] = right[j]
k += 1
j += 1
def tim_sort(array):
n = len(array)
minrun = find_minrun(n)
for start in range(0, n, minrun):
end = min(start + minrun - 1, n - 1)
insertion_sort(array, start, end)
size = minrun
while size < n:
for left in range(0, n, 2 * size):
mid = min(n - 1, left + size - 1)
right = min((left + 2 * size - 1), (n - 1))
merge(array, left, mid, right)
size = 2 * size
tim_sort(ds_arr)
这是正在发生的事情:
例如,对于块大小 = 38(在运行期间计算)
ds_arr:
[(0, 127.062386, 233.88705, 2043.5269, 0.9048116, 0.91757613, 40.91537, 0.03138579, 0.03187117, 0.01391112, 7452.246, 16),
(0, 185.04272, 327.6841, 673.2232, 0.94436234, 0.8363992, 34.514076, 0.0662952, 0.05692514, 0.11432385, 2303.1208, 24),
(0, 215.6241, 213.80653, 1756.7979, 0.89432126, 0.9836176, 36.561146, 0.03355443, 0.03731155, 0.0907836, 6127.373, 123),
... (+16 elements),
(0, 244.40192, 115.47513, 1171.4187, 0.9789816, 0.9536665, 26.107262, 0.04582449, 0.04447512, 0.02585861, 3681.2678, 6),
(0, 252.34537, 376.03607, 1285.4608, 0.93303615, 0.9969181, 34.071854, 0.0423173, 0.04572778, 0.06407942, 4192.847, 18),
(0, 343.2354, 207.10603, 859.55945, 0.93482053, 0.9706468, 35.455666, 0.05554156, 0.05821768, 0.03690968, 2773.9412, 34),
... (+16 elements),
other chunks]
left:
[(0, 127.062386, 233.88705, 2043.5269, 0.9048116, 0.91757613, 40.91537, 0.03138579, 0.03187117, 0.01391112, 7452.246, 16),
(0, 185.04272, 327.6841, 673.2232, 0.94436234, 0.8363992, 34.514076, 0.0662952, 0.05692514, 0.11432385, 2303.1208, 24),
(0, 215.6241, 213.80653, 1756.7979, 0.89432126, 0.9836176, 36.561146, 0.03355443, 0.03731155, 0.0907836, 6127.373, 123),
... (+16 elements)]
right:
[(0, 244.40192, 115.47513, 1171.4187, 0.9789816, 0.9536665, 26.107262, 0.04582449, 0.04447512, 0.02585861, 3681.2678, 6),
(0, 252.34537, 376.03607, 1285.4608, 0.93303615, 0.9969181, 34.071854, 0.0423173, 0.04572778, 0.06407942, 4192.847, 18),
(0, 343.2354, 207.10603, 859.55945, 0.93482053, 0.9706468, 35.455666, 0.05554156, 0.05821768, 0.03690968, 2773.9412, 34),
... (+16 elements)]
sorted ds_arr:
[(0, 244.40192, 115.47513, 1171.4187, 0.9789816, 0.9536665, 26.107262, 0.04582449, 0.04447512, 0.02585861, 3681.2678, 6),
(0, 244.40192, 115.47513, 1171.4187, 0.9789816, 0.9536665, 26.107262, 0.04582449, 0.04447512, 0.02585861, 3681.2678, 6),
(0, 244.40192, 115.47513, 1171.4187, 0.9789816, 0.9536665, 26.107262, 0.04582449, 0.04447512, 0.02585861, 3681.2678, 6),
... (+35 times),
similarly other chunks]
而不是得到一个排序列表。
您可以使用 argsort 和 integer array indexing 根据其中一列对数组进行排序。
>>> a
array([[ -2, -8, 3, -8],
[ 12, 2, -12, -14],
[ 0, -7, 3, 7],
[ -9, 12, -12, 10],
[ 14, -4, -12, 11],
[ 7, 10, 14, 9]], dtype=int64)
>>> indices = a[:,1].argsort()
>>> indices
array([0, 2, 4, 1, 5, 3], dtype=int64)
>>> b[indices]
array([[ -2, -8, 3, -8],
[ 14, -4, -12, 11],
[ 7, 10, 14, 9],
[ 0, -7, 3, 7],
[ -9, 12, -12, 10],
[ 12, 2, -12, -14]], dtype=int64)
>>>