@tensorflow/tfjs-node-gpu:训练相对较小的神经网络时占用过多内存(RAM)

问题描述 投票:0回答:2

我试图用 tensorflow.js 的 GPU 后端训练一个输入单元超过 10,000 个的模型。这样做时,内存占用会填满我的 16GB RAM,然后程序退出。我认为这不太正常,因为其他人至少会用 640×480 的图像进行训练,那将是约 30 万个输入单元。也许关键在于我只使用了一维输入,但无论如何,下面是我的训练函数:

 async function learn() {
        console.log("learn");
        const tf = require('@tensorflow/tfjs');

        // Load the binding:
        require('@tensorflow/tfjs-node-gpu');  // Use '@tensorflow/tfjs-node-gpu' if running with GPU. / for cpu backend: require('@tensorflow/tfjs-node');

        const learningRate = 0.00001;
        const optimizer = tf.train.sgd(learningRate);

        // Train a simple model:
        const model = tf.sequential();
        model.add(tf.layers.dense({units: 76800, activation: 'linear', inputShape: [76800]})); //320x240

        model.add(tf.layers.dense({units: 1200, activation: 'relu'}));

        for (var i = 0; i < 8; i++) {
            model.add(tf.layers.dense({units: 800, activation: 'relu'}));
        }

        model.add(tf.layers.dense({units: 60, activation: 'relu'}));

        model.compile({optimizer: optimizer, loss: 'meanSquaredError'});

        console.log("in_tensor_sum.length" + in_tensor_sum.length);
        for (var for_ep = 0; for_ep < 100; for_ep++) {
            for (var i = 0; i < in_tensor_sum.length; i++) {

                var pre_xs = [];
                var pre_ys = [];

                pre_xs.push(in_tensor_sum[i]);
                pre_ys.push(out_tensor_sum[i]);


                var xs = tf.tensor(pre_xs);
                var ys = tf.tensor(pre_ys);


                await model.fit(xs, ys, {
                    epochs: 1, batchSize: 1,
                    callbacks: {
                        onEpochEnd: async (epoch, log) => {
                            console.log(`Epoch ${for_ep}: loss = ${log.loss}`);
                        },
                        onTrainEnd: async () => {

                        }
                    }
                });

            }
        }

     }  

正如你所看到的,我已经尝试把批量大小(batchSize)减小到 1,但似乎没有效果。下面是该程序的输出和堆栈跟踪:

cpu backend was already registered. Reusing existing backend
2019-02-10 08:25:46.566395: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2019-02-10 08:25:46.657945: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:964] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-02-10 08:25:46.658444: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: 
name: GeForce GTX 1060 6GB major: 6 minor: 1 memoryClockRate(GHz): 1.759
pciBusID: 0000:01:00.0
totalMemory: 5.93GiB freeMemory: 5.65GiB
2019-02-10 08:25:46.658458: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0
2019-02-10 08:25:47.174668: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-02-10 08:25:47.174690: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988]      0 
2019-02-10 08:25:47.174695: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0:   N 
2019-02-10 08:25:47.175247: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 5419 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1060 6GB, pci bus id: 0000:01:00.0, compute capability: 6.1)
in_tensor_sum.length41
Epoch 1 / 1
2019-02-10 08:26:54.416207: W tensorflow/core/framework/allocator.cc:122] Allocation of 6400000000 exceeds 10% of system memory.
2019-02-10 08:27:06.570627: W tensorflow/core/common_runtime/bfc_allocator.cc:267] Allocator (GPU_0_bfc) ran out of memory trying to allocate 5.96GiB.  Current allocation summary follows.
2019-02-10 08:27:06.570704: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (256):   Total Chunks: 5, Chunks in use: 3. 1.2KiB allocated for chunks. 768B in use in bin. 484B client-requested in use in bin.
2019-02-10 08:27:06.570730: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (512):   Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.570756: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (1024):  Total Chunks: 1, Chunks in use: 1. 1.2KiB allocated for chunks. 1.2KiB in use in bin. 1.0KiB client-requested in use in bin.
2019-02-10 08:27:06.570778: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (2048):  Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.570806: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (4096):  Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.570827: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (8192):  Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.570847: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (16384):     Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.570867: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (32768):     Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.570888: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (65536):     Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.570915: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (131072):    Total Chunks: 2, Chunks in use: 2. 312.5KiB allocated for chunks. 312.5KiB in use in bin. 312.5KiB client-requested in use in bin.
2019-02-10 08:27:06.570937: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (262144):    Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.570958: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (524288):    Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.570978: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (1048576):   Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.571019: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (2097152):   Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.571038: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (4194304):   Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.571059: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (8388608):   Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.571080: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (16777216):  Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.571100: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (33554432):  Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.571121: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (67108864):  Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.571141: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (134217728):     Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.571163: I tensorflow/core/common_runtime/bfc_allocator.cc:597] Bin (268435456):     Total Chunks: 1, Chunks in use: 0. 5.29GiB allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2019-02-10 08:27:06.571186: I tensorflow/core/common_runtime/bfc_allocator.cc:613] Bin for 5.96GiB was 256.00MiB, Chunk State: 
2019-02-10 08:27:06.571213: I tensorflow/core/common_runtime/bfc_allocator.cc:619]   Size: 5.29GiB | Requested Size: 0B | in_use: 0, prev:   Size: 256B | Requested Size: 240B | in_use: 1
2019-02-10 08:27:06.571236: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x7facd6000000 of size 256
2019-02-10 08:27:06.571254: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x7facd6000100 of size 1280
2019-02-10 08:27:06.571271: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Free  at 0x7facd6000600 of size 256
2019-02-10 08:27:06.571289: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x7facd6000700 of size 160000
2019-02-10 08:27:06.571306: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x7facd6027800 of size 256
2019-02-10 08:27:06.571323: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x7facd6027900 of size 160000
2019-02-10 08:27:06.571339: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Free  at 0x7facd604ea00 of size 256
2019-02-10 08:27:06.571357: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Chunk at 0x7facd604eb00 of size 256
2019-02-10 08:27:06.571373: I tensorflow/core/common_runtime/bfc_allocator.cc:632] Free  at 0x7facd604ec00 of size 5682435072
2019-02-10 08:27:06.571390: I tensorflow/core/common_runtime/bfc_allocator.cc:638]      Summary of in-use Chunks by size: 
2019-02-10 08:27:06.571408: I tensorflow/core/common_runtime/bfc_allocator.cc:641] 3 Chunks of size 256 totalling 768B
2019-02-10 08:27:06.571428: I tensorflow/core/common_runtime/bfc_allocator.cc:641] 1 Chunks of size 1280 totalling 1.2KiB
2019-02-10 08:27:06.571448: I tensorflow/core/common_runtime/bfc_allocator.cc:641] 2 Chunks of size 160000 totalling 312.5KiB
2019-02-10 08:27:06.571467: I tensorflow/core/common_runtime/bfc_allocator.cc:645] Sum Total of in-use chunks: 314.5KiB
2019-02-10 08:27:06.571491: I tensorflow/core/common_runtime/bfc_allocator.cc:647] Stats: 
Limit:                  5682757632
InUse:                      322048
MaxInUse:                   322560
NumAllocs:                      10
MaxAllocSize:               160000

2019-02-10 08:27:06.571523: W tensorflow/core/common_runtime/bfc_allocator.cc:271] *___________________________________________________________________________________________________
(node:2186) UnhandledPromiseRejectionWarning: Error: Invalid TF_Status: 13
Message: Dst tensor is not initialized.
    at NodeJSKernelBackend.getInputTensorIds (/home/test/node_modules/@tensorflow/tfjs-node-gpu/dist/nodejs_kernel_backend.js:146:38)
    at NodeJSKernelBackend.executeSingleOutput (/home/test/node_modules/@tensorflow/tfjs-node-gpu/dist/nodejs_kernel_backend.js:186:73)
    at NodeJSKernelBackend.reshape (/home/test/node_modules/@tensorflow/tfjs-node-gpu/dist/nodejs_kernel_backend.js:927:21)
    at environment_1.ENV.engine.runKernel.$x (/home/test/node_modules/@tensorflow/tfjs/node_modules/@tensorflow/tfjs-core/dist/ops/array_ops.js:199:83)
    at /home/test/node_modules/@tensorflow/tfjs/node_modules/@tensorflow/tfjs-core/dist/engine.js:129:26
    at Engine.scopedRun (/home/test/node_modules/@tensorflow/tfjs/node_modules/@tensorflow/tfjs-core/dist/engine.js:101:23)
    at Engine.runKernel (/home/test/node_modules/@tensorflow/tfjs/node_modules/@tensorflow/tfjs-core/dist/engine.js:127:14)
    at reshape_ (/home/test/node_modules/@tensorflow/tfjs/node_modules/@tensorflow/tfjs-core/dist/ops/array_ops.js:199:37)
    at Object.reshape (/home/test/node_modules/@tensorflow/tfjs/node_modules/@tensorflow/tfjs-core/dist/ops/operation.js:23:29)
    at Variable.Tensor.reshape (/home/test/node_modules/@tensorflow/tfjs/node_modules/@tensorflow/tfjs-core/dist/tensor.js:302:26)
(node:2186) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). (rejection id: 1)
(node:2186) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.

编辑:RAM 被填满发生在下面这一行(程序退出之前,我的 16GB RAM 已被占满):

// Adds a dense layer with 76800 units and inputShape [76800] (76800 = 320 * 240).
// NOTE(review): presumably this layer's weight matrix holds 76800 * 76800 values,
// which would make it by far the largest allocation in the model - verify.
model.add(tf.layers.dense({units: 76800, activation: 'linear', inputShape: [76800]})); //320x240

所以,程序甚至还没有执行到 model.compile(....);

编辑 2:好的,当我仅用下面的代码创建一个空模型时:

// Script entry point: loads the TensorFlow.js GPU binding and starts model
// construction with the chosen learning rate.
const tf = require('@tensorflow/tfjs-node-gpu');

const learningRate = 0.00005; // previously tried: 0.0005

// learn_start is async, so a failure inside it surfaces as a rejected promise.
// Handle it here instead of leaving it as an unhandled rejection.
learn_start(learningRate).catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

/**
 * Assembles and compiles the dense network; no training is performed here.
 *
 * Layers, in order: 78600 linear units (inputShape [78600]), 1200 ReLU units,
 * eight layers of 800 ReLU units, and 60 ReLU units. The model is compiled
 * with an SGD optimizer and mean-squared-error loss.
 *
 * NOTE(review): the original comment labelled the first layer "320x240", but
 * 320 * 240 = 76800, not 78600 - confirm which size is intended.
 *
 * @param {number} rate - Learning rate handed to the SGD optimizer.
 * @returns {Promise<void>} Resolves once the model has been compiled.
 */
async function learn_start(rate) {
    console.log("learn");

    const sgdOptimizer = tf.train.sgd(rate);
    const network = tf.sequential();

    // Describe the stack as data first, then add the layers in that order.
    const reluLayer = (units) => ({units, activation: 'relu'});
    const layerConfigs = [
        {units: 78600, activation: 'linear', inputShape: [78600]},
        reluLayer(1200),
        ...Array.from({length: 8}, () => reluLayer(800)),
        reluLayer(60),
    ];

    layerConfigs.forEach((config) => {
        network.add(tf.layers.dense(config));
    });

    network.compile({optimizer: sgdOptimizer, loss: 'meanSquaredError'});
}

我得到以下错误:

(node:14684) UnhandledPromiseRejectionWarning: RangeError: Invalid typed array length: 6177960000
    at new Float32Array (<anonymous>)
    at Object.getArrayFromDType (/home/test/node_modules/@tensorflow/tfjs/node_modules/@tensorflow/tfjs-core/dist/util.js:262:18)
    at new TensorBuffer (/home/test/node_modules/@tensorflow/tfjs/node_modules/@tensorflow/tfjs-core/dist/tensor.js:67:28)
    at buffer (/home/test/node_modules/@tensorflow/tfjs/node_modules/@tensorflow/tfjs-core/dist/ops/array_ops.js:445:12)
    at truncatedNormal_ (/home/test/node_modules/@tensorflow/tfjs/node_modules/@tensorflow/tfjs-core/dist/ops/array_ops.js:107:15)
    at Object.truncatedNormal (/home/test/node_modules/@tensorflow/tfjs/node_modules/@tensorflow/tfjs-core/dist/ops/operation.js:23:29)
    at GlorotNormal.VarianceScaling.apply (/home/test/node_modules/@tensorflow/tfjs/node_modules/@tensorflow/tfjs-layers/dist/initializers.js:260:32)
    at Dense.Layer.addWeight (/home/test/node_modules/@tensorflow/tfjs/node_modules/@tensorflow/tfjs-layers/dist/engine/topology.js:576:37)
    at Dense.build (/home/test/node_modules/@tensorflow/tfjs/node_modules/@tensorflow/tfjs-layers/dist/layers/core.js:131:32)
    at /home/test/node_modules/@tensorflow/tfjs/node_modules/@tensorflow/tfjs-layers/dist/engine/topology.js:410:23
(node:14684) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). (rejection id: 1)
(node:14684) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.
node.js tensorflow out-of-memory tensorflow.js
2个回答
1
投票

您没有释放(dispose)任何张量。每次迭代都会创建新的张量并一直保留在内存中,除非您用 .dispose() 释放它们,或者在 tf.tidy() 内部使用它们。可以尝试用 tf.memory() 查看当前存在多少张量的概况。


0
投票

从错误信息来看,您的问题并不是提问中所说的 RAM 内存消耗,而是形状(shape)不匹配的问题。

模型的 inputShape 应与传递给 model.predict 函数的数据的形状相匹配。

热门问题
推荐问题
最新问题