使用tf.py_func产生输入数据

问题描述 投票:3回答:2

Python 版本 = 3.6.3,TensorFlow 版本 = 1.3.0

我之前一直使用 Keras,但现在想直接使用 TensorFlow。我想实现与 Keras 的 fit_generator 等价(equivalent)的功能,这样就不必在一开始把所有训练数据都加载到内存中,而是可以在训练过程中按需把数据送入网络。下面的代码是我在这方面的尝试;如果我的整体思路就是错的,希望有人能告诉我应该查阅哪部分文档、用什么关键字去搜索。

目前我的系统基于一个生成器(generator):它读取 SQLite 数据库文件,从中提取 np.array,然后把它们转换成我需要的数据形状(时间序列,向前预测一步)。我现在想把这套系统迁移到 TensorFlow 的 Dataset 上,但在使用 tf.py_func 时卡住了。下面是我目前尝试的代码:

import tensorflow as tf
import os
from tensorflow.contrib.data import Dataset, Iterator

import sqlite3
import pandas as pd
import numpy as np

# Number of consecutive rows that make up one input window.
LOOKBACK_ROWS = 600 
# Directory holding the SQLite files to read.
DATA_DIR = '/mnt/derived_data/processedData'

# Every entry name found in DATA_DIR; each one is later passed to
# data_from_files() through the dataset pipeline.
files = os.listdir(DATA_DIR)

def data_from_files(f):
    """Read one SQLite file and build sliding-window arrays from it.

    `f` is appended to DATA_DIR to form the database path. Column 0 of the
    query result fills windows of LOOKBACK_ROWS consecutive rows in X, and
    column 1 of each window's last row becomes the matching entry of Y.

    Returns a pair of tensors converted from X, shape
    (num_obs, LOOKBACK_ROWS, 1), and Y, shape (num_obs, 1), both float32.
    """
    # NOTE(review): the reported "must be str, not bytes" TypeError is
    # consistent with `f` arriving as bytes from tf.py_func -- confirm.
    with sqlite3.connect(DATA_DIR + f) as conn:
        # NOTE(review): the comma before FROM looks like invalid SQL -- confirm.
        results = conn.execute("SELECT col1, col2, FROM tbl")
        col_names = [d[0] for d in results.description]  # not used below
        arr = np.array(results.fetchall())

    # Number of complete LOOKBACK_ROWS-row windows that fit in arr.
    num_obs = arr.shape[0] - LOOKBACK_ROWS + 1

    X = np.zeros((num_obs, LOOKBACK_ROWS, 1), dtype = np.float32)
    Y = np.zeros((num_obs, 1), dtype = np.float32)

    for i in range(num_obs):
        idx = i + LOOKBACK_ROWS - 1  # index of the last row of window i
        X[i , :, 0] = arr[(idx - LOOKBACK_ROWS + 1):(idx + 1), 0]
        Y[i, 0] = arr[idx, 1]

    # NOTE(review): this function is called through tf.py_func, yet it returns
    # TensorFlow tensors rather than the NumPy arrays built above -- confirm
    # this is what py_func expects.
    return tf.convert_to_tensor(X, name = 'X'), tf.convert_to_tensor(Y, name = 'Y')

# Constant tensor holding the file names.
filenames = tf.constant(files)

# One dataset element per file name.
dataset = Dataset.from_tensor_slices((filenames))

# Run data_from_files on each file name through tf.py_func, declaring two
# float32 outputs; the py_func result is wrapped in a tuple.
dataset = dataset.map(lambda filename: tuple(tf.py_func(
    data_from_files,
    [filename],
    [tf.float32, tf.float32])))


# Iterator defined by the dataset's types and shapes; dataset_init_op binds it
# to this particular dataset.
iterator     = Iterator.from_structure(dataset.output_types, dataset.output_shapes)
next_element = iterator.get_next()
dataset_init_op = iterator.make_initializer(dataset)

with tf.Session() as sess:
    sess.run(dataset_init_op)

    # Pull elements until the iterator signals exhaustion.
    while True:
        try:
            elem = sess.run(next_element)
            print('Success')
        except tf.errors.OutOfRangeError:
            print('End of dataset.')
            break

初始化部分运行正常,但当我启动会话并运行时,出现以下错误:

  2017-10-16 16:58:45.227612: I tensorflow/core/common_runtime/gpu/gpu_device.cc:976] DMA: 0 
    2017-10-16 16:58:45.227615: I tensorflow/core/common_runtime/gpu/gpu_device.cc:986] 0:   Y 
    2017-10-16 16:58:45.227620: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1045] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:65:00.0)
    2017-10-16 16:58:45.276138: W tensorflow/core/framework/op_kernel.cc:1192] Invalid argument: TypeError: must be str, not bytes
    2017-10-16 16:58:45.276306: W tensorflow/core/framework/op_kernel.cc:1192] Invalid argument: TypeError: must be str, not bytes
         [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
    Traceback (most recent call last):
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1327, in _do_call
        return fn(*args)
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1306, in _run_fn
        status, run_metadata)
      File "/opt/python/3.6.3/lib/python3.6/contextlib.py", line 88, in __exit__
        next(self.gen)
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
        pywrap_tensorflow.TF_GetCode(status))
    tensorflow.python.framework.errors_impl.InvalidArgumentError: TypeError: must be str, not bytes
         [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
         [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]]

    During handling of the above exception, another exception occurred:

    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/home/usr/code/nn/data_folder/pipeline.py", line 51, in <module>
        elem = sess.run(next_element)
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 895, in run
        run_metadata_ptr)
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1124, in _run
        feed_dict_tensor, options, run_metadata)
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1321, in _do_run
        options, run_metadata)
      File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1340, in _do_call
        raise type(e)(node_def, op, message)
    tensorflow.python.framework.errors_impl.InvalidArgumentError: TypeError: must be str, not bytes
         [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
         [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]]
    >>> python.el: native completion setup loaded
    >>> 

问题

(1)这看起来正是 py_func 的适用场景,还是我理解错了?如果没错,有没有比 TensorFlow 文档讲得更深入的资料可以推荐?(我还注意到 GitHub 上有一个可能相关的 issue:https://github.com/tensorflow/tensorflow/issues/12396 ,但按其中的做法把所有输出用 tuple 包起来并没有解决我的问题。)

(2)我应该遵循怎样的通用流程?特别是当我想从一批文件名出发、并且每个文件名要输出不止一个训练样本(Example)时。

谢谢。

下面我重写了脚本,使其成为一个自包含、可直接运行的示例。我认为问题与上面的代码相同,但为了确认,我把错误信息也重新贴了出来。

结合了 @mrry 回答中的修改的、自包含可运行代码示例:

import tensorflow as tf
import os
import numpy as np

# Number of consecutive rows that make up one input window.
LOOKBACK_ROWS = 600 

# Stand-in input data: a random (2000, 2) array written to npfile.npy.
arr = np.random.random_sample((2000, 2))
np.save("npfile.npy", arr)

def data_from_files(f):
    """Load one .npy file and build sliding-window arrays from it.

    Column 0 of the loaded array fills windows of LOOKBACK_ROWS consecutive
    rows in X, and column 1 of each window's last row becomes the matching
    entry of Y.

    Returns the NumPy arrays X, shape (num_obs, LOOKBACK_ROWS, 1), and Y,
    shape (num_obs, 1), both float32.
    """

    # NOTE(review): the reported "'bytes' object has no attribute 'read'" error
    # is consistent with `f` arriving as bytes from tf.py_func -- confirm.
    arr = np.load(f)
    # Number of complete LOOKBACK_ROWS-row windows that fit in arr.
    num_obs = arr.shape[0] - LOOKBACK_ROWS + 1

    X = np.zeros((num_obs, LOOKBACK_ROWS, 1), dtype = np.float32)
    Y = np.zeros((num_obs, 1), dtype = np.float32)

    for i in range(num_obs):
        idx = i + LOOKBACK_ROWS - 1  # index of the last row of window i
        X[i , :, 0] = arr[(idx - LOOKBACK_ROWS + 1):(idx + 1), 0]
        Y[i, 0] = arr[idx, 1]

    return X, Y

# The single file written by np.save above.
files = ["npfile.npy"]
filenames = tf.constant(files)


# NOTE: In TensorFlow 1.4, `tf.contrib.data` is now `tf.data`.
# One dataset element per file name.
dataset = tf.contrib.data.Dataset.from_tensor_slices(filenames)

# NOTE: In TensorFlow 1.4, the `tuple` is no longer needed.
# Run data_from_files on each file name through tf.py_func, declaring two
# float32 outputs.
dataset = dataset.map(lambda filename: tuple(tf.py_func(
    data_from_files,
    [filename],
    [tf.float32, tf.float32])))

# NOTE: If you only have one `Dataset`, you do not need to use
# `Iterator.from_structure()`.
iterator     = dataset.make_initializable_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    sess.run(iterator.initializer)

    # Pull elements until the iterator signals exhaustion.
    while True:
        try:
            elem = sess.run(next_element)
            print('Success')
        except tf.errors.OutOfRangeError:
            print('End of dataset.')
            break

错误:

2017-10-16 18:30:44.143668: I tensorflow/core/common_runtime/gpu/gpu_device.cc:976] DMA: 0 
2017-10-16 18:30:44.143672: I tensorflow/core/common_runtime/gpu/gpu_device.cc:986] 0:   Y 
2017-10-16 18:30:44.143679: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1045] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:65:00.0)
2017-10-16 18:30:44.190852: W tensorflow/core/framework/op_kernel.cc:1192] Unknown: AttributeError: 'bytes' object has no attribute 'read'
2017-10-16 18:30:44.190959: W tensorflow/core/framework/op_kernel.cc:1192] Unknown: AttributeError: 'bytes' object has no attribute 'read'
     [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
Traceback (most recent call last):
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1327, in _do_call
    return fn(*args)
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1306, in _run_fn
    status, run_metadata)
  File "/opt/python/3.6.3/lib/python3.6/contextlib.py", line 88, in __exit__
    next(self.gen)
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
    pywrap_tensorflow.TF_GetCode(status))
tensorflow.python.framework.errors_impl.UnknownError: AttributeError: 'bytes' object has no attribute 'read'
     [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
     [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "demo.py", line 48, in <module>
    elem = sess.run(next_element)
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 895, in run
    run_metadata_ptr)
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1124, in _run
    feed_dict_tensor, options, run_metadata)
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1321, in _do_run
    options, run_metadata)
  File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1340, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.UnknownError: AttributeError: 'bytes' object has no attribute 'read'
     [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]]
     [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]]
tensorflow keras generator pipeline tensorflow-datasets
2个回答
2
投票

按相反的顺序回答你的问题:

我应该遵循怎样的通用流程?特别是当我想从一批文件名出发、并且每个文件名要输出不止一个训练样本时。

要把一个元素转换成多个元素,请使用 Dataset.flat_map(f) 转换。该转换允许你定义一个函数 f(x),把单个元素 x 映射为一个嵌套的 Dataset 对象,然后由它负责把嵌套的数据集展平。

这看起来正是 py_func 的适用场景,还是我理解错了?

这确实是 tf.py_func() 的适用场景,但你的程序里有一个小错误:tf.py_func() 操作要求你的函数(data_from_files())返回 NumPy 数组,而不是 tf.Tensor 对象。直接返回 X, Y 就可以了。


回答了这两点之后,我们来看看如何重写你的代码:

import tensorflow as tf
import os

import sqlite3
import pandas as pd
import numpy as np

# Number of consecutive rows that make up one input window.
LOOKBACK_ROWS = 600 
# Directory holding the SQLite files to read.
DATA_DIR = '/mnt/derived_data/processedData'

# Every entry name found in DATA_DIR; each one is later passed to
# data_from_files() through the dataset pipeline.
files = os.listdir(DATA_DIR)

def data_from_files(f, data_dir=None, lookback_rows=None):
    """Read one SQLite file and return sliding-window training arrays.

    Args:
        f: File name relative to `data_dir`. May be `str` or UTF-8 `bytes`;
            `tf.py_func` hands string tensors to Python as `bytes`.
        data_dir: Directory containing the file. Defaults to `DATA_DIR`.
        lookback_rows: Rows per input window. Defaults to `LOOKBACK_ROWS`.

    Returns:
        A pair of float32 NumPy arrays `(X, Y)`. `X` has shape
        `(num_obs, lookback_rows, 1)` and holds windows of `col1`; `Y` has
        shape `(num_obs, 1)` and holds `col2` from the last row of each
        window. `num_obs` is 0 when the table has fewer than `lookback_rows`
        rows.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    if lookback_rows is None:
        lookback_rows = LOOKBACK_ROWS

    # Decode first: concatenating a str path with a bytes name raises
    # "TypeError: must be str, not bytes".
    if isinstance(f, bytes):
        f = f.decode('utf-8')

    # `with sqlite3.connect(...)` only manages the transaction and leaves the
    # connection open, so close it explicitly.
    conn = sqlite3.connect(os.path.join(data_dir, f))
    try:
        # NOTE(review): no ORDER BY, so rows come back in whatever order
        # SQLite scans the table -- confirm that matches the time order.
        rows = conn.execute("SELECT col1, col2 FROM tbl").fetchall()
    finally:
        conn.close()

    # reshape keeps the array two-dimensional even when the table is empty.
    arr = np.array(rows, dtype=np.float32).reshape(-1, 2)

    # Clamp at zero so short tables yield empty arrays instead of a
    # negative-dimension error from np.zeros.
    num_obs = max(arr.shape[0] - lookback_rows + 1, 0)

    X = np.zeros((num_obs, lookback_rows, 1), dtype=np.float32)
    Y = np.zeros((num_obs, 1), dtype=np.float32)

    for i in range(num_obs):
        last = i + lookback_rows - 1  # index of the last row of window i
        X[i, :, 0] = arr[i:last + 1, 0]
        Y[i, 0] = arr[last, 1]

    return X, Y

def _load_arrays(filename):
    """Call data_from_files via tf.py_func and return its two float32 outputs."""
    # NOTE: In TensorFlow 1.4, the `tuple` is no longer needed.
    outputs = tf.py_func(
        data_from_files,
        [filename],
        [tf.float32, tf.float32])
    return tuple(outputs)


filenames = tf.constant(files)

# NOTE: In TensorFlow 1.4, `tf.contrib.data` is now `tf.data`.
# One element per file name, each mapped to the arrays loaded from that file.
dataset = tf.contrib.data.Dataset.from_tensor_slices(filenames).map(_load_arrays)

# NOTE: If you only have one `Dataset`, you do not need to use
# `Iterator.from_structure()`.
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    sess.run(iterator.initializer)

    # Keep fetching until the iterator reports that it is exhausted.
    exhausted = False
    while not exhausted:
        try:
            elem = sess.run(next_element)
        except tf.errors.OutOfRangeError:
            print('End of dataset.')
            exhausted = True
        else:
            print('Success')

1
投票

我在使用 TensorFlow 时也遇到了相同的错误(AttributeError: 'bytes' object has no attribute 'read')。我的做法是:卸载 numpy,删除 "Lib\site-packages" 下与 numpy 相关的文件,然后重新安装 numpy,之后错误就消失了。也许是更新 numpy 时出了什么问题。

© www.soinside.com 2019 - 2024. All rights reserved.