在 TensorFlow.js 中为张量层解包位(unpack bits)时性能较慢

问题描述 投票:0回答:1

我正在处理与Starcraft 2客户端通过WebSocket连接获得的数据,以从正在进行的游戏中获取图像数据。在某些情况下,可以将图像数据设置为每个像素1位的格式。发生这种情况时,我需要对响应中每个字节的位进行“拆包”(1个字节=> 8位)。这是在下面的代码中完成的:

/**
 * Expands every input byte into eight 0/1 elements, most significant bit
 * first.
 *
 * @param {Uint8Array} uint8data - Packed bytes to expand.
 * @returns {Uint8Array} Array of length `8 * uint8data.length` holding one
 *   bit per element.
 */
function unpackbits(uint8data) {
  const byteCount = uint8data.length;
  const bits = new Uint8Array(byteCount * 8);
  let writeIndex = 0;
  for (let byteIndex = 0; byteIndex < byteCount; byteIndex += 1) {
    const value = uint8data[byteIndex];
    // Walk from bit 7 down to bit 0 so the output is MSB-first.
    for (let shift = 7; shift >= 0; shift -= 1) {
      bits[writeIndex] = (value >> shift) & 1;
      writeIndex += 1;
    }
  }
  return bits;
}

这将像这样输入张量:

 static unpack_layer(plane) {
    // Return a tensor of shape [size.y, size.x] with dtype 'int32' built from
    // the feature layer's data.
    //
    // NOTE(review): `data` is read and reassigned below but is never declared
    // or initialised in this excerpt. Presumably it should first be loaded
    // from `plane` (the Python version in this document starts from
    // np.frombuffer(plane.data, ...)) — confirm against the full source.

    const size = point.Point.build(plane.getSize()) // { x, y }

    // 1-bit-per-pixel layers are expanded to one array element per bit.
    if (plane.getBitsPerPixel() === 1) {
      data = unpackbits(data)
      if (data.length !== (size.x * size.y)) {
        // This could happen if the correct length isn't a multiple of 8, leading
        // to some padding bits at the end of the string which are incorrectly
        // interpreted as data.
        data = data.slice(0, size.x * size.y)
      }
    }

    // Rows correspond to y and columns to x.
    data = tf.tensor(data, [size.y, size.x], 'int32')
    return data
}

在我的一项测试中,该代码运行了1900次,并执行了0.0737s。

这非常慢。

为了进行比较,python中的等效功能需要0.0209s来运行1900次。 python代码如下所示:

  def unpack_layer(plane):
    """Build a (size.y, size.x) numpy array from a feature layer's raw bytes."""

    dims = point.Point.build(plane.size)  # exposes .x and .y
    pixels = np.frombuffer(
        plane.data, dtype=Feature.dtypes[plane.bits_per_pixel])

    if plane.bits_per_pixel == 1:
      # Expand every packed byte into eight single-bit elements.
      pixels = np.unpackbits(pixels)
      expected_count = dims.x * dims.y
      if pixels.shape[0] != expected_count:
        # When the pixel count is not a multiple of 8 the final byte carries
        # padding bits; drop them so they are not treated as pixels.
        pixels = pixels[:expected_count]

    return pixels.reshape(dims.y, dims.x)

简而言之,JavaScript 版本的耗时大约是 Python 版本的 4 倍。

我将查看numpy的unpackbits文档,因为这似乎比我自己的方法要有效得多-

但是,我想知道是否有人对如何更好地优化自己的解包功能或更好的让TensorFlow为我做到这一点有任何想法?

javascript bitmap bit-manipulation tensorflow.js
1个回答
1
投票
但是,一个建议是创建一个256个大小为8的Uint8Array的数组,并用8字节转换的完整列表预先填充它。这大大减少了字节流的重复计算,该字节流可能具有0-255范围内的重复值。例如,预计算数组中的第一个条目表示字节0的解包,因此是大小为8的Uint8Array,其中填充了0,下一个条目是另一个大小为8的Uint8Array,填充有00000001,依此类推,直到代表字节255的条目为止,是一个大小为8的Uint8Array,其中填充了全1。

然后,在解压缩时,只需使用类型化数组.set方法将预先计算的解压缩表示形式复制到results Uint8Array ...

希望这会有所帮助。

EDIT创建了许多解压缩算法的变体,以测试内联计算与内存查找的性能,并对使用Chrome的结果感到惊讶。 V8编译器的某些优化是不直观的...

版本差异...

unpackbits [FAST]:取自原始问题,是其他变体进行比较的基准。

    unpackbits1 [更快]:修改为...
  • 在每个整数后加上“|0”。
    • 使用自增一元运算符(“++”)来推进 results 数组的 offset 索引,而不是在 offset 上加增量。
    • 用实际数值代替位掩码的计算。(即,函数使用 32,而不是 1 << 5。)
  • unpackbits1a [最快]:由...修改
  • 在每个整数后指定“ | 0”。
    • 使用增量一元运算符(“ ++”),而不是将增量添加到offset数组的result索引。
    • 但是保留位掩码的计算。 (即,使用1 << 5而不是32。)
    • 与直觉相反,这会产生更快的结果!
  • unpackbits1b [SLOWER]:与unpackbits1a相同,除了...
  • offset不会在循环内每次都重新计算。即,首先设置offset = 0|0,然后在循环内仅将offset递增。因此,不再为每个字节计算offset = ( 8|0 * i )。
      与直觉相反,这会产生较慢的结果!(注意:由于 * 的优先级高于 |,表达式 8|0 * i 会被解析为 8 | (0 * i),其值恒为 8,而不是 8 * i。)
  • unpackbits2 [SLOWEST]:这是我上面推荐的内存查找选项。与直觉相反,这意味着类型化数组的内存操作比像unpackbits那样直接计算结果要慢得多!
  • unpackbits3 [SLOWER]:这是我上面建议的内存查找选项,但有以下更改。
  • 而不是使用类型化数组.set方法,此版本将八个字节一一设置。
      与直觉相反,这意味着类型化数组的.set方法比逐个设置值要慢(至少对于八个字节而言)!
  • unpackbits4 [FAST]:算法的这种变化与原始变化相当,并且是内存查找选项的变化。但是,它不是将256个单独的Uint8Array组合在一起,而是将所有预先计算的结果合并为一个长度为256 * 8的Uint8Array。并且它没有使用类型化数组.set方法。
  • unpackbits5 [SLOWER]:与unpackbits4相同,除了...
  • 而不是对查找表中的索引使用一元“ ++”,而是为要复制的8个字节中的每一个计算索引。不出所料,每次计算索引比使用一元“ ++”运算符要慢。
    • 这里是测试。请注意,这将构建一个初始的10M随机字节数组,然后在此相同数据上运行每个拆包算法。在我的工作站上,测试运行时间不到5秒。
  // Benchmark comparing several ways of expanding packed bytes into one array
  // element per bit (most significant bit first). Every variant must return
  // the same output as `unpackbits`; only the implementation strategy differs.

  // The lookup tables are built once, before any timing starts. Function
  // declarations are hoisted, so the builders can be called from here.
  const lookupTable = initializeLookupTable();
  const lookupTable4 = initializeLookupTable4();

  /**
   * Builds the per-byte lookup table used by unpackbits2 and unpackbits3.
   *
   * @returns {Uint8Array[]} 256 arrays of length 8; entry `b` holds the bits
   *   of byte value `b`, most significant bit first.
   */
  function initializeLookupTable() {
    const lookup = new Array(256);
    const v = new Uint8Array(1);
    for (let i = 0; i < 256; i++) {
      v[0] = i;
      lookup[i] = unpackbits(v);
    }
    return lookup;
  }

  /**
   * Builds the flat lookup table used by unpackbits4 and unpackbits5.
   *
   * @returns {Uint8Array} Array of length 256 * 8; the bits of byte value `b`
   *   occupy indices `8 * b` through `8 * b + 7`.
   */
  function initializeLookupTable4() {
    const lookup = new Uint8Array(256 * 8);
    const v = new Uint8Array(1);
    for (let i = 0; i < 256; i++) {
      v[0] = i;
      lookup.set(unpackbits(v), i * 8);
    }
    return lookup;
  }

  /**
   * Baseline from the original question: computes each mask with a shift and
   * writes to `offset + k`.
   *
   * @param {Uint8Array} uint8data - Packed bytes.
   * @returns {Uint8Array} One 0/1 element per input bit, MSB first.
   */
  function unpackbits(uint8data) {
    const results = new Uint8Array(8 * uint8data.length);
    for (let i = 0; i < uint8data.length; i++) {
      const byte = uint8data[i];
      const offset = 8 * i;
      results[offset + 7] = (byte & (1 << 0)) >> 0;
      results[offset + 6] = (byte & (1 << 1)) >> 1;
      results[offset + 5] = (byte & (1 << 2)) >> 2;
      results[offset + 4] = (byte & (1 << 3)) >> 3;
      results[offset + 3] = (byte & (1 << 4)) >> 4;
      results[offset + 2] = (byte & (1 << 5)) >> 5;
      results[offset + 1] = (byte & (1 << 6)) >> 6;
      results[offset + 0] = (byte & (1 << 7)) >> 7;
    }
    return results;
  }

  /**
   * Variant 1: "|0" integer hints, `offset++` writes, literal masks.
   *
   * `|` binds more loosely than `*`, `&` and `>>`, so each hint is applied to
   * a parenthesised sub-expression. Written as `8|0 * i` the offset would
   * evaluate to `8 | (0 * i)`, i.e. always 8.
   *
   * @param {Uint8Array} uint8data - Packed bytes.
   * @returns {Uint8Array} One 0/1 element per input bit, MSB first.
   */
  function unpackbits1(uint8data) {
    const results = new Uint8Array(8 * uint8data.length);
    const n = uint8data.length;
    for (let i = 0; i < n; i++) {
      const byte = uint8data[i];
      let offset = (8 * i) | 0;
      results[offset++] = ((byte & 128) >> 7) | 0;
      results[offset++] = ((byte & 64) >> 6) | 0;
      results[offset++] = ((byte & 32) >> 5) | 0;
      results[offset++] = ((byte & 16) >> 4) | 0;
      results[offset++] = ((byte & 8) >> 3) | 0;
      results[offset++] = ((byte & 4) >> 2) | 0;
      results[offset++] = ((byte & 2) >> 1) | 0;
      results[offset++] = (byte & 1) | 0;
    }
    return results;
  }

  /**
   * Variant 1a: like unpackbits1 but the masks are computed with shifts.
   *
   * Written as `1|0 << k|0` the mask would evaluate to `1 | (0 << k) | 0`,
   * i.e. always 1, so the shift is parenthesised.
   *
   * @param {Uint8Array} uint8data - Packed bytes.
   * @returns {Uint8Array} One 0/1 element per input bit, MSB first.
   */
  function unpackbits1a(uint8data) {
    const results = new Uint8Array(8 * uint8data.length);
    const n = uint8data.length;
    for (let i = 0; i < n; i++) {
      const byte = uint8data[i];
      let offset = (8 * i) | 0;
      results[offset++] = ((byte & (1 << 7)) >> 7) | 0;
      results[offset++] = ((byte & (1 << 6)) >> 6) | 0;
      results[offset++] = ((byte & (1 << 5)) >> 5) | 0;
      results[offset++] = ((byte & (1 << 4)) >> 4) | 0;
      results[offset++] = ((byte & (1 << 3)) >> 3) | 0;
      results[offset++] = ((byte & (1 << 2)) >> 2) | 0;
      results[offset++] = ((byte & (1 << 1)) >> 1) | 0;
      results[offset++] = (byte & 1) | 0;
    }
    return results;
  }

  /**
   * Variant 1b: like unpackbits1a but `offset` is a running index that is
   * never recomputed from `i`.
   *
   * @param {Uint8Array} uint8data - Packed bytes.
   * @returns {Uint8Array} One 0/1 element per input bit, MSB first.
   */
  function unpackbits1b(uint8data) {
    const results = new Uint8Array(8 * uint8data.length);
    const n = uint8data.length;
    let offset = 0;
    for (let i = 0; i < n; i++) {
      const byte = uint8data[i];
      results[offset++] = ((byte & (1 << 7)) >> 7) | 0;
      results[offset++] = ((byte & (1 << 6)) >> 6) | 0;
      results[offset++] = ((byte & (1 << 5)) >> 5) | 0;
      results[offset++] = ((byte & (1 << 4)) >> 4) | 0;
      results[offset++] = ((byte & (1 << 3)) >> 3) | 0;
      results[offset++] = ((byte & (1 << 2)) >> 2) | 0;
      results[offset++] = ((byte & (1 << 1)) >> 1) | 0;
      results[offset++] = (byte & 1) | 0;
    }
    return results;
  }

  /**
   * Variant 2: copies the precomputed 8-element entry of `lookupTable` with
   * the typed-array `.set` method.
   *
   * @param {Uint8Array} uint8data - Packed bytes.
   * @returns {Uint8Array} One 0/1 element per input bit, MSB first.
   */
  function unpackbits2(uint8data) {
    const result = new Uint8Array(8 * uint8data.length);
    const n = uint8data.length;
    let ri = 0;
    for (let i = 0; i < n; i++) {
      result.set(lookupTable[uint8data[i]], ri);
      ri += 8;
    }
    return result;
  }

  /**
   * Variant 3: same table as unpackbits2, but the eight elements are copied
   * one by one instead of with `.set`.
   *
   * @param {Uint8Array} uint8data - Packed bytes.
   * @returns {Uint8Array} One 0/1 element per input bit, MSB first.
   */
  function unpackbits3(uint8data) {
    const result = new Uint8Array(8 * uint8data.length);
    const n = uint8data.length;
    let ri = 0;
    for (let i = 0; i < n; i++) {
      const lv = lookupTable[uint8data[i]];
      result[ri++] = lv[0];
      result[ri++] = lv[1];
      result[ri++] = lv[2];
      result[ri++] = lv[3];
      result[ri++] = lv[4];
      result[ri++] = lv[5];
      result[ri++] = lv[6];
      result[ri++] = lv[7];
    }
    return result;
  }

  /**
   * Variant 4: reads from the flat `lookupTable4`, advancing the table index
   * with `++`.
   *
   * @param {Uint8Array} uint8data - Packed bytes.
   * @returns {Uint8Array} One 0/1 element per input bit, MSB first.
   */
  function unpackbits4(uint8data) {
    const result = new Uint8Array(8 * uint8data.length);
    const n = uint8data.length;
    let ri = 0;
    for (let i = 0; i < n; i++) {
      let li = (uint8data[i] * 8) | 0;
      result[ri++] = lookupTable4[li++];
      result[ri++] = lookupTable4[li++];
      result[ri++] = lookupTable4[li++];
      result[ri++] = lookupTable4[li++];
      result[ri++] = lookupTable4[li++];
      result[ri++] = lookupTable4[li++];
      result[ri++] = lookupTable4[li++];
      result[ri++] = lookupTable4[li++];
    }
    return result;
  }

  /**
   * Variant 5: like unpackbits4 but each table index is computed as `li + k`.
   *
   * The source of the copy is `lookupTable4`; reading from `uint8data` here
   * would index the input bytes instead of the precomputed bits.
   *
   * @param {Uint8Array} uint8data - Packed bytes.
   * @returns {Uint8Array} One 0/1 element per input bit, MSB first.
   */
  function unpackbits5(uint8data) {
    const result = new Uint8Array(8 * uint8data.length);
    const n = uint8data.length;
    let ri = 0;
    for (let i = 0; i < n; i++) {
      const li = (uint8data[i] * 8) | 0;
      result[ri++] = lookupTable4[li];
      result[ri++] = lookupTable4[li + 1];
      result[ri++] = lookupTable4[li + 2];
      result[ri++] = lookupTable4[li + 3];
      result[ri++] = lookupTable4[li + 4];
      result[ri++] = lookupTable4[li + 5];
      result[ri++] = lookupTable4[li + 6];
      result[ri++] = lookupTable4[li + 7];
    }
    return result;
  }

  /**
   * Runs every variant on all 256 byte values and compares the output with
   * the baseline, so that a timing is never reported for a variant that
   * computes something different.
   *
   * @throws {Error} If a variant's output differs from that of `unpackbits`.
   */
  function verifyVariants() {
    const allBytes = Uint8Array.from({ length: 256 }, (_, i) => i);
    const expected = unpackbits(allBytes);
    const variants = [
      unpackbits1,
      unpackbits1a,
      unpackbits1b,
      unpackbits2,
      unpackbits3,
      unpackbits4,
      unpackbits5,
    ];
    for (const variant of variants) {
      const actual = variant(allBytes);
      const matches =
        actual.length === expected.length &&
        actual.every((bit, k) => bit === expected[k]);
      if (!matches) {
        throw new Error(`${variant.name} does not produce the same output as unpackbits`);
      }
    }
  }

  // Test
  verifyVariants();

  console.log('Building array of 10,000,000 test values.');
  const testArray = new Uint8Array(10000000);
  for (let i = 0; i < testArray.length; i++) {
    testArray[i] = Math.floor(256 * Math.random());
  }
  console.log('Finished building test values.');

  // Timer label paired with the variant it measures, in execution order.
  const benchmarks = [
    ['u', unpackbits],
    ['u1', unpackbits1],
    ['u1a', unpackbits1a],
    ['u1b', unpackbits1b],
    ['u2', unpackbits2],
    ['u3', unpackbits3],
    ['u4', unpackbits4],
    ['u5', unpackbits5],
  ];

  // Holds the most recent result so each call's output is actually stored.
  let u;
  for (const [label, fn] of benchmarks) {
    console.log(`Starting ${fn.name}.`);
    console.time(label);
    u = fn(testArray);
    console.timeEnd(label);
    console.log(`Finished ${fn.name}.`);
  }
  • © www.soinside.com 2019 - 2024. All rights reserved.