Why does my neural network predict the wrong class labels for test images belonging to one class, despite a very high validation accuracy?

Problem description (0 votes, 2 answers)

I am using the Inception v4 model to train a classifier on 3 classes, A, B and C, each having around 900 images in the training dataset and 80 images in the validation set. I ran my training code for 200 epochs with a batch size of 8. I got an average validation accuracy of more than 99% and a very low loss:

Epoch 199/200
303/303 [==============================] - 53s 174ms/step - loss: 0.0026 - accuracy: 0.9996 - val_loss: 5.1226e-04 - val_accuracy: 1.0000
Epoch 200/200
303/303 [==============================] - 53s 176ms/step - loss: 0.0019 - accuracy: 1.0000 - val_loss: 0.1079 - val_accuracy: 0.9750

When I run my testing code on the images in directory A of the validation set, it predicts 80% of the images as class A and 20% as class C, and nothing as class B. The same happens with directory C (80% as C, 20% as A). On directory B, all images are predicted as either class A or class C. In all three test cases, the test program classifies no images into class B, despite the high validation accuracy and despite using exactly the same directories of images that were used for validation during training (the latter also leads me to believe that this is not primarily caused by overfitting).

Here is the output of the test program on directory B:

25/25 [==============================] - 8s 186ms/step - loss: 0.0212 - accuracy: 0.9963
['loss', 'accuracy']
[0.02124088630080223, 0.9963099360466003]
Testing images located in val/B/
[[6.2504888e-01 8.8258091e-08 3.7495103e-01]]
A:62.5%
[[8.8602149e-01 1.3459101e-05 1.1396510e-01]]
A:88.6%
[[4.7189465e-01 4.0863368e-05 5.2806443e-01]]
C:52.81%
[[1.0370950e-01 2.7608112e-07 8.9629024e-01]]
C:89.63%
[[7.1212035e-01 3.3269991e-06 2.8787634e-01]]
A:71.21%

And so on.

I even tried dividing the line

img = np.expand_dims(test_image, axis=0)

by 255, as described in another question I had asked elsewhere. That was successful in that case, but not here.

Here is my training code:

# Imports assumed by this snippet; the inception_* and reduction_* block
# helpers are defined elsewhere, following the Inception v4 paper.
from collections import Counter
from time import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dropout, Flatten, Dense, AveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def create_inception_v4(nb_classes, load_weights, checkpoint_path):

    init = Input((299, 299, 3))

    x = inception_stem(init)

    # 4 x Inception A
    for i in range(4):
        x = inception_A(x)

    # Reduction A
    x = reduction_A(x)

    # 7 x Inception B
    for i in range(7):
        x = inception_B(x)

    # Reduction B
    x = reduction_B(x)

    # 3 x Inception C
    for i in range(3):
        x = inception_C(x)

    # Average Pooling
    x = AveragePooling2D((8, 8))(x)

    # Dropout - Use 0.2, as mentioned in official paper. 
    x = Dropout(0.2)(x)
    x = Flatten()(x)

    # Output
    out = Dense(nb_classes, activation='softmax')(x)

    model = Model(init, out, name='Inception-v4')

    if load_weights:
        weights = checkpoint_path
        model.load_weights(weights, by_name=True)
        print("Model weights loaded.")
 
    return model





def train(args,check,checkpoint_path,network_name="inceptionv4"):
    n_gpus=int(args['gpus'])      
   
    sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))

    datagen=ImageDataGenerator(rescale=1/255,
                rotation_range=40,
                width_shift_range=0.1,
                height_shift_range=0.1,
                shear_range=0.1,
                zoom_range=0.1,
                horizontal_flip=True,
                fill_mode='nearest',
                samplewise_std_normalization=True)

    val_datagen = ImageDataGenerator(rescale=1/255)

    batch_size = int(args["batch_size"])

    train_generator = datagen.flow_from_directory(train_dir,target_size=(299,299),class_mode="categorical", batch_size=batch_size)
    # (note: this uses the augmenting `datagen`, not `val_datagen`)
    val_gen = datagen.flow_from_directory(val_dir,target_size=(299,299),class_mode="categorical", batch_size=batch_size)

    mc = keras.callbacks.ModelCheckpoint(f"{network_name}_checkpoints/{network_name}.h5", save_weights_only=True, save_best_only=True)

    tensorboard = TensorBoard(log_dir="{}/{}".format(args["log_dir"], time()))

    validation_steps = 10


    model = create_inception_v4(int(args["num_classes"]),check,checkpoint_path)
    model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.SGD(learning_rate=float(args['learning_rate']), decay=1e-6, momentum=0.9, nesterov=True), metrics=["accuracy"])   

    counter = Counter(train_generator.classes)                          
    max_val = float(max(counter.values()))       
    class_weights = {class_id : max_val/num_images for class_id, num_images in counter.items()}    
   
    hist = model.fit(train_generator,epochs=num_epochs,verbose=True,validation_data=val_gen,validation_steps=validation_steps,callbacks=[mc, tensorboard], class_weight=class_weights)
    model.save(f"checkpoints/{network_name}_{num_epochs}epochs.h5")

Here is my testing code:

def test_model(test_dir, num_epochs,class_names, network_name="inceptionv4",):

    model=load_model(f'checkpoints/{network_name}_{num_epochs}epochs.h5')


    datagen=ImageDataGenerator(rescale=1/255,
                rotation_range=40,
                width_shift_range=0.1,
                height_shift_range=0.1,
                shear_range=0.1,
                zoom_range=0.1,
                horizontal_flip=True,
                fill_mode='nearest',         
                samplewise_std_normalization=True)

    val_datagen = ImageDataGenerator(rescale=1/255)
    val_dir = "val/"
    val_gen = datagen.flow_from_directory(val_dir,target_size=(299,299),class_mode="categorical")


    test_accuracy=model.evaluate(val_gen,steps=25)
    print(model.metrics_names)
    print(test_accuracy)

    
    img_width, img_height = 299, 299

    print(f"Testing images located in {test_dir}")
    counter = 0
    results_dict = {}
    start_time = time.time()
    
    for filename_img in os.listdir(test_dir):
        counter += 1
        filename = os.path.join(test_dir,filename_img)
        img = image.load_img(filename, target_size=(img_width, img_height))
        test_image = image.img_to_array(img)
        test_image.shape
        img = np.expand_dims(test_image, axis=0)/255
        classes = model.predict(img, batch_size=10)

        print(classes)
        predicted_class = class_names[np.argmax(classes)]

        if predicted_class not in results_dict.keys():
            results_dict[predicted_class] = 1
        else:
            results_dict[predicted_class] += 1

        print(f"{predicted_class}:{round(np.amax(classes)*100,2)}%")
        if counter % 100 == 0:
            print(f"{counter} files processed!")

    time_taken = time.time() - start_time
    time_taken = round(time_taken,2)
    print(f"{counter} images processed in {time_taken} seconds, at a rate of {round(counter/time_taken,2)} images per second.")
    
    for predicted_class in results_dict.keys():
        print(f"{predicted_class} = {results_dict[predicted_class]} predictions")

What am I doing wrong?

EDIT 1 - I tried to account for imbalanced classes by adding the class_weight parameter, as shown in the edited code above. It still cannot predict class B. I even tried using val_datagen instead of datagen, which led to even worse results.
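
For what it's worth, here is roughly how the class_weight dict from the training code comes out. Since my classes are roughly balanced (about 900 images each), all the weights land close to 1.0, which presumably explains why this change made no difference (the counts below are illustrative):

from collections import Counter

# Hypothetical per-class-index counts, close to my actual dataset
counter = Counter({0: 900, 1: 905, 2: 895})
max_val = float(max(counter.values()))
class_weights = {class_id: max_val / n for class_id, n in counter.items()}
print(class_weights)  # {0: 1.0056, 1: 1.0, 2: 1.0112} -- all roughly 1.0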

EDIT 2 - I have now copied the whole folder elsewhere, then deleted class B and kept classes A and C. I trained the model, once again got a very high training accuracy, and now my testing program only ever predicts class C and never class A. I have a feeling that I have made a really silly mistake somewhere in my test.py code.

python tensorflow machine-learning keras neural-network
2 Answers
0 votes

This was a really frustrating bug. I realized that I was getting a high validation accuracy from model.evaluate() on a whole directory, but not from model.predict() on individual images. This is because the image augmentation techniques used for training were also applied to validation, but not to the individual images fed to the model as input.

In this case, I realized that samplewise_std_normalization was not being applied to the test images. So, inspired by this answer, I used the standardize function - test_image = datagen.standardize(test_image) - and now my model works just fine. The full test.py code is below:

# Imports assumed by this snippet:
import os
import time

import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def test_model(test_dir, num_epochs, class_names, network_name="inceptionv4"):

    model = load_model(f'checkpoints/{network_name}_{num_epochs}epochs.h5')


    datagen=ImageDataGenerator(rescale=1/255,
                rotation_range=40,
                width_shift_range=0.1,
                height_shift_range=0.1,
                shear_range=0.1,
                zoom_range=0.1,
                horizontal_flip=True,
                fill_mode='nearest',         
                samplewise_std_normalization=True)

    val_datagen = ImageDataGenerator(rescale=1/255)
    val_dir = "val/"
    val_gen = datagen.flow_from_directory(val_dir,target_size=(299,299),class_mode="categorical")


    test_accuracy=model.evaluate(val_gen,steps=25)
    print(model.metrics_names)
    print(test_accuracy)

    
    img_width, img_height = 299, 299

    print(f"Testing images located in {test_dir}")
    counter = 0
    results_dict = {}
    start_time = time.time()
    
    for filename_img in os.listdir(test_dir):
        counter += 1
        filename = os.path.join(test_dir,filename_img)
        img = image.load_img(filename, target_size=(img_width, img_height))
        test_image = image.img_to_array(img)
        test_image = np.expand_dims(test_image, axis=0)  
        # Don't divide by 255, this is taken care of by the standardize function
        test_image = datagen.standardize(test_image)
        classes = model.predict(test_image, batch_size=10)

        print(classes)
        predicted_class = class_names[np.argmax(classes)]

        if predicted_class not in results_dict.keys():
            results_dict[predicted_class] = 1
        else:
            results_dict[predicted_class] += 1

        print(f"{predicted_class}:{round(np.amax(classes)*100,2)}%")
        if counter % 100 == 0:
            print(f"{counter} files processed!")

    time_taken = time.time() - start_time
    time_taken = round(time_taken,2)
    print(f"{counter} images processed in {time_taken} seconds, at a rate of {round(counter/time_taken,2)} images per second.")
    
    for predicted_class in results_dict.keys():
        print(f"{predicted_class} = {results_dict[predicted_class]} predictions")

0 votes

I trained a recognition model to recognize images of three different classes. The problem with my script is that whenever I try to predict the classes of a large number of images at once, as with a test set, it works fine and correctly predicts 77 out of 91 images; but when I created a prediction function to predict the class of each image one by one, it only predicted 47 images correctly. Can you tell me what mistake I am making here?
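
The mismatch described in the first answer is the likely cause here as well: the batch path goes through the generator's preprocessing, while the one-by-one path does not. A minimal sketch of a single-image predictor that reuses the generator's preprocessing (function and variable names are illustrative):

import numpy as np
from tensorflow.keras.preprocessing import image

def predict_one(model, datagen, path, class_names, size=(299, 299)):
    """Predict a single image with the same preprocessing the generators use."""
    img = image.load_img(path, target_size=size)
    x = np.expand_dims(image.img_to_array(img), axis=0)
    x = datagen.standardize(x)  # same rescale/normalization as training
    probs = model.predict(x)
    return class_names[int(np.argmax(probs))]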
