如何从包含文件名的列表中加载tensorflow数据管道中不同目录的.npy文件?

问题描述 投票:0回答:1

我试图将numpy数组(x, 1, 768)和标签(1, 768)加载到tf.data中,我的代码如下。

import pandas as pdb
import pdb
import numpy as np
import os, glob
import tensorflow as tf
#from tensorflow import keras
from tensorflow.keras import layers, initializers
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
from tensorflow.keras import layers
#from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from natsort import natsorted
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

#################################################################

#File Paths
# Glob patterns for the feature arrays and for the encoded tag arrays.
text_path = 'data/featured/*'
tags_path = 'data/encoded_tags/*'

# Expand both patterns and order the matches with natsorted.
text_files = natsorted(glob.glob(text_path))
tags_files = natsorted(glob.glob(tags_path))

# Training subset: the leading 90% (rounded) of each listing.
text_train = text_files[:round(len(text_files) * 0.9)]
tags_train = tags_files[:round(len(tags_files) * 0.9)]

# Pipeline and training settings.
AUTO = tf.data.experimental.AUTOTUNE
index = 0
PADDING_LENGTH = 768
BATCH_LENGTH = 1
LEARNING_RATE = 0.01
OPTIMISER = 'ADAM'

            #Define the training parameters here.
#################################################################

#@tf.function
def load_files(filename1, filename2):
    """Load one feature array and its tag array from two ``.npy`` files.

    Args:
        filename1: Path of the ``.npy`` file holding the feature array.
        filename2: Path of the ``.npy`` file holding the tag array.

    Returns:
        A ``(arr, tags)`` tuple with the objects loaded from ``filename1``
        and ``filename2`` respectively.
    """
    # Read from the two declared parameters; there is no ``filename``
    # sequence in scope to index into.
    # NOTE: np.load needs a real path (str, bytes or os.PathLike). A symbolic
    # Tensor, as passed in when this is used directly as a Dataset.map
    # function, raises TypeError.
    # NOTE: allow_pickle=True lets np.load unpickle object arrays, which can
    # run arbitrary code; only use it on trusted files.
    arr = np.load(filename1, allow_pickle=True)
    tags = np.load(filename2, allow_pickle=True)

    # Perform padding and convert back to tensor

    return arr, tags

def load_dataset(text_files, tag_files):
    """Build a dataset by mapping ``load_files`` over paired filenames.

    Args:
        text_files: Filenames of the feature arrays.
        tag_files: Filenames of the tag arrays; expected to have the same
            length and order as ``text_files``.

    Returns:
        The dataset obtained by mapping ``load_files`` over the
        ``(text_file, tag_file)`` pairs.
    """
    # Pass a tuple so each element is a (text_file, tag_file) pair and the
    # map function receives two arguments. A list of the two lists would be
    # stacked into a single tensor and sliced along its first axis, giving
    # one-argument elements that each hold a whole list of filenames.
    dataset = tf.data.Dataset.from_tensor_slices((text_files, tag_files))
    print(dataset)
    #dataset = dataset.map(load_files)
    #dataset = dataset.map(lambda x: tf.py_function(load_files, [x], tf.float64))
    dataset = dataset.map(map_func=load_files, num_parallel_calls=AUTO)
    return dataset

def get_batch_dataset(filename1, filename2):
    """Return a batched, endlessly repeating dataset for the given files.

    Args:
        filename1: Filenames of the feature arrays, forwarded to
            ``load_dataset``.
        filename2: Filenames of the tag arrays, forwarded to
            ``load_dataset``.

    Returns:
        The dataset from ``load_dataset``, batched by ``BATCH_LENGTH``,
        repeated indefinitely and prefetched.
    """
    dataset = load_dataset(filename1, filename2)
    dataset = dataset.batch(BATCH_LENGTH)
    # Repeat before prefetching so that prefetch is the final stage of the
    # pipeline; the elements produced are the same as with the other order.
    dataset = dataset.repeat().prefetch(AUTO)
    return dataset

def get_training_dataset():
    """Return the batched dataset built from the training file lists."""
    training_dataset = get_batch_dataset(text_train, tags_train)
    return training_dataset


# Build the training pipeline through the dedicated helper, which makes the
# same get_batch_dataset(text_train, tags_train) call.
dataset = get_training_dataset()

当我试图通过数组的文件名和标签来读取numpy数组时, 它抛出了以下错误:

TypeError: expected str, bytes or os.PathLike object, not Tensor

我已经试过了:

filename1.numpy() # doesnt work:
AttributeError: 'Tensor' object has no attribute 'numpy'
filename.as_string() # doesnt work either:
AttributeError: 'Tensor' object has no attribute 'as_string'

我只需要把这些数组读取为 numpy 数组,因为我需要对它们进行填充。我试过用 tf.io.read_file() 来读取,但不知为何它把数组弄乱了,返回的形状是 (None,)。每个数组(对应一个唯一的文件名)的长度为 x,如前所述,我需要进行填充并输出固定大小的数组,以便将其输入神经网络。

先谢谢你的帮助。

python tensorflow keras tensorflow2.0 tensorflow-datasets
1个回答
0
投票

传递给 dataset.map 的函数会被追踪(trace)并作为 TensorFlow 图执行,因此传递给该函数的参数是 Tensor。这就是为什么你会得到

TypeError: expected str, bytes or os.PathLike object, not Tensor

如果你想让你的函数对字符串而不是Tensors进行操作,你可以使用 tf.py_function op:

def load_files_py(filename1, filename2):
    """Load the feature array and tag array named by two eager tensors.

    Intended to be called through ``tf.py_function``, where the arguments
    are eager tensors whose ``.numpy()`` value is the file path.

    Args:
        filename1: Eager tensor holding the path of the feature-array file.
        filename2: Eager tensor holding the path of the tag-array file.

    Returns:
        A ``(arr, tags)`` tuple loaded from ``filename1`` and ``filename2``
        respectively.
    """
    # The pipeline passes (text_file, tag_file), so the feature array comes
    # from filename1 and the tags from filename2, matching the (arr, tags)
    # return order.
    # NOTE: allow_pickle=True can run arbitrary code when loading object
    # arrays; only use it on trusted files.
    arr = np.load(filename1.numpy(), allow_pickle=True)
    tags = np.load(filename2.numpy(), allow_pickle=True)

    # Perform padding and convert back to tensor

    return arr, tags

def load_file(filename1, filename2):
    """Run ``load_files_py`` on the two filename tensors via ``tf.py_function``.

    Args:
        filename1: First filename tensor, forwarded to ``load_files_py``.
        filename2: Second filename tensor, forwarded to ``load_files_py``.

    Returns:
        Whatever ``tf.py_function`` returns for the two declared outputs.
    """
    # NOTE(review): arr_type and tags_type are not defined in this snippet;
    # presumably they are the dtypes of the returned array and tags and must
    # be set before this function is used.
    output_types = [arr_type, tags_type]
    return tf.py_function(
        load_files_py,
        inp=[filename1, filename2],
        Tout=output_types,
    )

下面的代码展示了在使用和不使用 py_function 的情况下,传递给函数的参数有何不同:

import tensorflow as tf

def load_py(a):
  """Print the type and the ``.numpy()`` value of ``a``, then return ``a``."""
  # When invoked through py_function the argument is an eager tensor, so its
  # string value is available via `.numpy()`.
  arg_type = type(a)
  tf.print(arg_type)  # <class 'tensorflow.python.framework.ops.EagerTensor'>
  raw_value = a.numpy()
  tf.print(raw_value)  # b'a'
  return a

def load(a):
  """Print the type of ``a`` and pass it to ``load_py`` via ``tf.py_function``."""
  # As a Dataset.map function this runs in graph mode, so `a` is a non-eager
  # Tensor here.
  tf.print(type(a))  # <class 'tensorflow.python.framework.ops.Tensor'>
  wrapped = tf.py_function(load_py, inp=[a], Tout=[tf.string])
  return wrapped

# Map `load` over a two-element string dataset and fetch the first element.
ds = tf.data.Dataset.from_tensor_slices(["a", "b"]).map(load)
next(iter(ds))

https://colab.research.google.com/drive/1Tr04ykdBGx01uCMUHdyBLXV4VQMi-6dU

© www.soinside.com 2019 - 2024. All rights reserved.