我正在尝试在Colab上使用TPU训练一个CNN U-Net模型,但不明白自己哪里做错了。
我尝试过寻找解决方案,但网上所有的示例都是基于TensorFlow 1.x的。
以下是我的代码:
数据生成器
# Side length, in pixels, of the square image/mask arrays in each batch.
IMG_SIZE = 512
# Number of samples placed in every batch the data generator yields.
BATCH_SIZE_GLOBAL = 128
def train_data_generator():
    """Yield endless batches of augmented ``(images, masks)`` pairs.

    Each batch holds ``BATCH_SIZE_GLOBAL`` samples drawn at random from the
    module-level ``train_data`` / ``train_masks``.  An image and its mask are
    passed through ``ImageDataGenerator.random_transform`` with the same seed,
    handed to ``random_crop`` and divided by 255.

    Yields:
        tuple: ``(x, y)`` float32 arrays, both of shape
        ``(BATCH_SIZE_GLOBAL, IMG_SIZE, IMG_SIZE, 3)``.

    Raises:
        ValueError: if ``train_data`` is empty.
    """
    data_gen_args = dict(
        rotation_range=30,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest',
        brightness_range=[.8, 1.0],
    )
    # NOTE(review): brightness_range is applied to the masks as well as the
    # images because both go through the same generator -- confirm this is
    # intended for the label data.
    image_datagen = ImageDataGenerator(**data_gen_args)

    n_samples = len(train_data)
    if n_samples == 0:
        raise ValueError("train_data is empty; cannot build a batch")
    # Sampling without replacement needs at least one full batch of samples;
    # fall back to sampling with replacement for smaller datasets.
    with_replacement = n_samples < BATCH_SIZE_GLOBAL
    batch_shape = (BATCH_SIZE_GLOBAL, IMG_SIZE, IMG_SIZE, 3)

    while True:
        batch_indices = np.random.choice(
            n_samples, BATCH_SIZE_GLOBAL, replace=with_replacement)
        # float32 matches the dtype declared for the tf.data pipeline and
        # halves host memory compared with NumPy's default float64.
        x = np.zeros(batch_shape, dtype=np.float32)
        y = np.zeros(batch_shape, dtype=np.float32)
        for i, index in enumerate(batch_indices):
            # One seed per sample keeps the image and mask transforms in sync.
            # Drawing from the full 32-bit range avoids limiting the number of
            # distinct augmentations to len(train_data) + 1.
            seed = random.randrange(2 ** 32)
            img = image_datagen.random_transform(
                x=train_data[index], seed=seed)
            mask = image_datagen.random_transform(
                x=train_masks[index], seed=seed)
            # presumably crops both arrays to IMG_SIZE x IMG_SIZE -- random_crop
            # is defined elsewhere; verify its contract there.
            img, mask = random_crop(img, mask, IMG_SIZE, IMG_SIZE)
            x[i] = img / 255
            y[i] = mask / 255
        yield x, y
# Wrap the Python batch generator defined above in a tf.data pipeline.  Each
# element is already a full batch, so the batch dimension is part of the shape.
# NOTE(review): from_generator executes Python code on the local host; confirm
# that this input path is supported by the TPU setup being used.
train_dataset = tf.data.Dataset.from_generator(
    train_data_generator,
    (tf.float32, tf.float32),
    output_shapes=(
        (BATCH_SIZE_GLOBAL, IMG_SIZE, IMG_SIZE, 3),
        (BATCH_SIZE_GLOBAL, IMG_SIZE, IMG_SIZE, 3),
    ),
)
我对验证数据集执行相同的操作。
模型
def _double_conv(tensor, filters):
    """Apply two 3x3 same-padded ReLU convolutions with ``filters`` channels."""
    for _ in range(2):
        tensor = layers.Conv2D(
            filters, (3, 3), activation='relu', padding='same')(tensor)
    return tensor


def _upsample_and_merge(tensor, skip):
    """Upsample ``tensor`` by 2x and concatenate ``skip`` on the channel axis."""
    upsampled = layers.UpSampling2D(size=(2, 2))(tensor)
    return layers.concatenate([upsampled, skip], axis=3)


def get_model(img_size=None):
    """Build the U-Net: four pooling stages down, four upsampling stages up.

    Args:
        img_size: side length of the square 3-channel input.  ``None`` (the
            default) uses the module-level ``IMG_SIZE`` so the network always
            matches the batches produced by the data generator.

    Returns:
        An uncompiled Keras ``Model`` mapping ``(img_size, img_size, 3)``
        inputs to a same-sized 3-channel sigmoid output.

    Raises:
        ValueError: if ``img_size`` is not a positive multiple of 16.
    """
    if img_size is None:
        img_size = IMG_SIZE
    # Four 2x2 poolings halve the resolution four times; any other size would
    # make an upsampled map differ in shape from the skip connection it is
    # concatenated with.
    if img_size <= 0 or img_size % 16 != 0:
        raise ValueError(
            "img_size must be a positive multiple of 16, got %r" % (img_size,))

    inputs = Input((img_size, img_size, 3))

    # Contracting path.
    conv1 = _double_conv(inputs, 32)
    pool1 = layers.MaxPooling2D(pool_size=(2, 2))(conv1)
    conv2 = _double_conv(pool1, 64)
    pool2 = layers.MaxPooling2D(pool_size=(2, 2))(conv2)
    conv3 = _double_conv(pool2, 128)
    pool3 = layers.MaxPooling2D(pool_size=(2, 2))(conv3)
    conv4 = _double_conv(pool3, 256)
    drop4 = layers.Dropout(0.5)(conv4)
    pool4 = layers.MaxPooling2D(pool_size=(2, 2))(drop4)

    # Bottleneck.
    conv5 = _double_conv(pool4, 512)
    drop5 = layers.Dropout(0.5)(conv5)

    # Expanding path: each stage merges the encoder feature map of the same
    # resolution before convolving.
    conv6 = _double_conv(_upsample_and_merge(drop5, drop4), 256)
    conv7 = _double_conv(_upsample_and_merge(conv6, conv3), 128)
    conv8 = _double_conv(_upsample_and_merge(conv7, conv2), 64)
    conv9 = _double_conv(_upsample_and_merge(conv8, conv1), 32)

    # 1x1 convolution producing the per-pixel 3-channel sigmoid output.
    output = layers.Conv2D(3, (1, 1), activation='sigmoid')(conv9)

    model = Model(inputs=[inputs], outputs=[output])
    return model
# Locate the Colab TPU through the address Colab exposes in the environment.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
    tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
# Connect to the TPU cluster described by the resolver (rather than to a
# single host address) before initializing the TPU system, as in the
# TensorFlow 2.x TPU guide.
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.experimental.TPUStrategy(resolver)

# Build and compile the model inside the strategy scope.
with strategy.scope():
    model = get_model()
    # `learning_rate` is the current argument name; `lr` is a deprecated alias.
    model.compile(
        optimizer=optimizers.Adam(learning_rate=1e-4),
        loss=bce_dice_loss,
        metrics=[jaccard_coef],
    )

# The generators yield full batches forever, so the epoch length has to be
# given explicitly as a number of batches.
steps = len(train_data) // BATCH_SIZE_GLOBAL
validation_steps = len(test_data) // BATCH_SIZE_GLOBAL
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
    steps_per_epoch=steps,
    validation_steps=validation_steps,
)
我得到的错误是:
---------------------------------------------------------------------------
InternalError Traceback (most recent call last)
<ipython-input-18-a6bd71894524> in <module>()
---> 40 history = model.fit(train_dataset ...
12 frames
/usr/local/lib/python3.6/dist-packages/six.py in raise_from(value, from_value)
InternalError: Assigned device '/job:worker/replica:0/task:0/device:TPU:0' does not have registered OpKernel support for _Arg
[[{{node iteratorgetnext_iterator}}]] [Op:__inference_distributed_function_45676]
我使用的是TensorFlow 2.1.0-rc1自带的Keras(tf.keras)。
TF2.0及更高版本。您需要将TF版本降低到1.15、1.14(我的建议)或1.13。
请阅读此处(here)的说明,了解TF版本的支持情况。