Geringe Genauigkeit bei der Verwendung von Bildtransformatoren zur Bildklassifizierung

Guest · Post by **Guest** » 14 Jan 2025, 11:08

Ich habe die Funktionsweise eines Vision-Transformators gelernt, konnte ihn jedoch zunächst nicht zum Laufen bringen (den ViT von Grund auf neu erstellen). Aber irgendwie ist es mir gelungen, einen Code zu verschlüsseln, der eine sehr geringe Genauigkeit aufweist (3 %).
Ich habe versucht, mich durch ihn zu chatten, konnte es aber scheinbar nicht herausfinden.
Ich habe das gleiche Verfahren verwendet, das ich bereits verwendet habe ein zuvor erstelltes Vit auf einem anderen Datensatz (fashionMnist), der eine sehr gute Genauigkeit lieferte und versuchte, es auf diesem Datensatz abzubilden.

Code: Select all

from google.colab import files
files.upload()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d mahdavi1202/skin-cancer

import zipfile
import os

with zipfile.ZipFile('skin-cancer.zip', 'r') as zip_ref:
zip_ref.extractall('skin-cancer')

os.listdir('skin-cancer')
import pandas as pd
import shutil
os.mkdir("/content/all_images")
source_dir="skin-cancer/imgs_part_1/imgs_part_1"
file_names = os.listdir(source_dir)
for file_name in file_names:
shutil.move(os.path.join(source_dir, file_name), "/content/all_images")

source_dir="skin-cancer/imgs_part_2/imgs_part_2"
file_names = os.listdir(source_dir)
for file_name in file_names:
shutil.move(os.path.join(source_dir, file_name), "/content/all_images")

source_dir="skin-cancer/imgs_part_3/imgs_part_3"
file_names = os.listdir(source_dir)
for file_name in file_names:
shutil.move(os.path.join(source_dir, file_name), "/content/all_images")

data = pd.read_csv("skin-cancer/metadata.csv")
data['full_link'] = '/kaggle/working/full_images/' + data['img_id']
data.info()
import matplotlib.pyplot as plt
fig, ax1 = plt.subplots(1, 1, figsize= (10, 5))
data['diagnostic'].value_counts().plot(kind='bar', ax=ax1)

diagnostic_classes = {0:'BCC', 1 : 'ACK', 2 : 'NEV', 3 : 'SEK', 4 : 'SCC', 5: 'MEL'}

# a function for encoding classes
def create_class(X):
if X == 'BCC':
return 0
elif X =='ACK':
return 1
elif X == 'NEV':
return 2
elif X == 'SEK':
return 3
elif X == 'SCC':
return 4
elif X == 'MEL':
return 5
else:
print('error class')
data['encoded_class'] = data['diagnostic'].apply(create_class)
data.drop(['diagnostic'], axis = 1, inplace = True)
data.sort_values(by ='patient_id', ascending = True, inplace = True, ignore_index = True)
data.info()
data.drop([ 'biopsed','patient_id','img_id','lesion_id','smoke', 'drink', 'background_father', 'background_mother', 'pesticide', 'gender', 'skin_cancer_history',
'cancer_history', 'has_piped_water', 'has_sewage_system', 'fitspatrick', 'diameter_1', 'diameter_2'], axis = 1, inplace = True)
data.info()

from sklearn.utils import shuffle
import tensorflow as tf
import numpy as np
IMG_SIZE = 32,32
BATCH_SIZE = 256
SEED = 55
AUTO = tf.data.AUTOTUNE
train_data = data[:2000]
test_data = data[2000:]
test_data = shuffle(test_data, random_state = SEED).reset_index(drop = True)

print('train  ->', train_data.shape)
print('test  ->', test_data.shape)
counts = np.bincount(train_data['encoded_class'])

weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]
weight_for_2 = 1.0 / counts[2]
weight_for_3 = 1.0 / counts[3]
weight_for_4 = 1.0 / counts[4]
weight_for_5 = 1.0 / counts[5]

class_weight = {0: weight_for_0, 1: weight_for_1, 2: weight_for_2, 3: weight_for_3, 4: weight_for_4, 5:  weight_for_5}
class_weight
data.info()
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import cv2
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

class ClassToken(Layer):
def __init__(self, **kwargs):
super().__init__(**kwargs)

def build(self, input_shape):
self.cls = self.add_weight(             #adding a trainable paramter to a custom layer
name="cls",                         #name of the weight
shape=(1, 1, input_shape[-1]),
initializer="zeros",
trainable=True,
)

def call(self, x):
batch_size = tf.shape(x)[0]
cls = tf.tile(self.cls, [batch_size, 1, 1])
x = tf.concat([cls, x], axis=1)
return x

def mlp(x, cf):
x = Dense(cf["mlp_dim"], activation="gelu")(x)
x = Dropout(cf["dropout_rate"])(x)
x = Dense(cf["hidden_dim"])(x)
x = Dropout(cf["dropout_rate"])(x)
return x

def transformer_encoder(x, cf):
skip_1 = x
x = LayerNormalization()(x)
x = MultiHeadAttention(
num_heads=cf["num_heads"], key_dim=cf["hidden_dim"]
)(x, x)
x = Add()([x, skip_1])

skip_2 = x
x = LayerNormalization()(x)
x = mlp(x, cf)
x = Add()([x, skip_2])

return x

def ViT(cf):

input_shape = (cf["num_patches"], cf["patch_size"]*cf["patch_size"]*cf["num_channels"])
inputs = Input(input_shape)

patch_embed = Dense(cf["hidden_dim"])(inputs)

positions = tf.range(start=0, limit=cf["num_patches"], delta=1)
pos_embed = Embedding(input_dim=cf["num_patches"], output_dim=cf["hidden_dim"])(positions)
embed = patch_embed + pos_embed

x = ClassToken()(embed)

for _ in range(cf["num_layers"]):
x = transformer_encoder(x, cf)

x = LayerNormalization()(x)
x = x[:, 0, :]
x = Dense(cf["num_classes"], activation="softmax")(x)

model = Model(inputs, x)
return model

def preprocess_image(image, patch_size, target_size=(32, 32)): # IMG_SIZE is (32, 32)
# Decode the image within the function if it's a byte string
if isinstance(image, bytes):
image = tf.image.decode_jpeg(image, channels=3)  # Decode byte string to tensor

# Ensure image has 3 dimensions (height, width, channels) before resizing
image = tf.ensure_shape(image, [None, None, 3])

image = tf.image.resize(image, target_size)
image = tf.cast(image, tf.float32) / 255.0
# Remove tf.expand_dims(image, axis=0) to avoid extra dimension

patches = tf.image.extract_patches(
images=image,
sizes=[1, patch_size, patch_size, 1],
strides=[1, patch_size, patch_size, 1],
rates=[1, 1, 1, 1],
padding='VALID'
)
# Reshape to (num_patches, patch_size * patch_size * num_channels)
# This should match the input shape expected by your ViT model
patches = tf.reshape(patches, [-1, patch_size * patch_size * 3])
return patches

def create_dataset(df):
image_paths = df['full_link'].values
labels = df['encoded_class'].values

dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))
# Pass the raw file content to preprocess_image
dataset = dataset.map(lambda path, label: (preprocess_image(tf.io.read_file(tf.strings.as_string(path)), cf["patch_size"]), label))
dataset = dataset.batch(BATCH_SIZE)
return dataset

image_dir = "all_images"

data.info()
import tensorflow as tf

def load_and_preprocess_image(image_path, label):
"""Loads an image from a file path, preprocesses it, and returns a tuple of (image, label).

Args:
image_path: The path to the image file.
label:  The label associated with the image.

Returns:
A tuple containing the preprocessed image and label.
"""
img = tf.io.read_file(image_path)
img = tf.image.decode_jpeg(img, channels=3)
img = tf.image.resize(img, IMG_SIZE)  # Assuming IMG_SIZE is defined as (32, 32)
img = tf.cast(img, tf.float32) / 255.0  # Normalize pixel values
# Convert the image into patches here
img = preprocess_image(img, patch_size=cf["patch_size"]) # patch_size is defined in the cf dictionary
return img, label

def augment(image, label):
"""Applies data augmentation to an image.

Args:
image: The image to augment.
label: The label associated with the image.

Returns:
A tuple containing the augmented image and label.
"""
# You can add more augmentation techniques as needed
image = tf.image.random_flip_left_right(image)
image = tf.image.random_flip_up_down(image)
image = tf.image.random_brightness(image, max_delta=0.2)  # Adjust brightness
image = tf.image.random_contrast(image, lower=0.8, upper=1.2)  # Adjust contrast
# Other augmentations: random rotation, cropping, etc.
return image, label
IMG_SIZE = (32, 32)
BATCH_SIZE = 256
SEED = 55
AUTO = tf.data.AUTOTUNE

# Create file paths (assuming all images are in /content/all_images)
all_image_files = os.listdir("/content/all_images")
image_paths = ["/content/all_images/" + filename for filename in all_image_files]

# Create labels based on file order and original DataFrame order
labels = []
filenames_no_ext = [f.split('.')[0] for f in all_image_files]

# Ensure correct mapping even after shuffling filenames
for filename in filenames_no_ext:
matching_row = data[data['full_link'].str.contains(filename)] #using full_link which wasn't dropped
if not matching_row.empty:
labels.append(matching_row['encoded_class'].values[0])
else:
print(f"Warning: No matching entry found for file: {filename}")
#Handle missing images in the CSV. One option is to skip or assign a default label
#labels.append(-1) #Example of assigning -1 as a missing label

labels = np.array(labels)
image_paths = np.array(image_paths)

# Shuffle image paths and labels together
image_paths, labels = shuffle(image_paths, labels, random_state=SEED)

# Split data
train_image_paths = image_paths[:2000]
train_labels = labels[:2000]
test_image_paths = image_paths[2000:]
test_labels = labels[2000:]

train_dataset = tf.data.Dataset.from_tensor_slices((train_image_paths, train_labels))

train_dataset = (
train_dataset.map(load_and_preprocess_image, num_parallel_calls=AUTO)
.map(augment, num_parallel_calls=AUTO) # Augmentation
.shuffle(buffer_size=len(train_image_paths))
.batch(BATCH_SIZE)
.prefetch(AUTO)
)
test_dataset = tf.data.Dataset.from_tensor_slices((test_image_paths, test_labels))
test_dataset = (
test_dataset.map(load_and_preprocess_image, num_parallel_calls=AUTO)
.batch(BATCH_SIZE)
.prefetch(AUTO)
)

# ...  (Your ViT model definition remains the same) ...

cf = {
"num_patches": (IMG_SIZE[0] // 4) * (IMG_SIZE[1] // 4), #adjust patch size if needed
"patch_size": 4,
"num_channels": 3,
"hidden_dim": 64,
"mlp_dim": 128,
"num_heads": 4,
"num_layers": 4,
"dropout_rate": 0.1,
"num_classes": 6,
}

model = ViT(cf)

# Compile the model
model.compile(
loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"],
)

# Train the model
history = model.fit(
train_dataset,
epochs=10,  # Adjust as needed
validation_data=test_dataset,
class_weight=class_weight
)

# Evaluate the model
loss, accuracy = model.evaluate(test_dataset)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Prediction Example
def predict_image(image_path):
img = tf.io.read_file(image_path)
img = tf.image.decode_jpeg(img, channels=3)
img = tf.image.resize(img, IMG_SIZE)
img = tf.image.convert_image_dtype(img, dtype=tf.float32) / 255.0
img = tf.expand_dims(img, axis=0)  # Add batch dimension
prediction = model.predict(img)
predicted_class = np.argmax(prediction)
return predicted_class

# Example usage:
sample_image_path = test_image_paths[0]
predicted_class = predict_image(sample_image_path)
print(f"Predicted class for {sample_image_path}: {predicted_class}")

# Save the model
model.save('skin_cancer_vit_model.h5')

make all the changes preferably after the definition of the vit model to make the code work and return the accuracy og the classificaiton of the model

Ein zuvor erstelltes ViT wurde auf einen Mode-Mnist-Datensatz zur Bildklassifizierung abgebildet und hatte eine geringe Genauigkeit. Erwarten Sie eine Genauigkeit von mindestens 50 % für 10 Epochen. Ich gehe davon aus, dass die meisten Änderungen in meiner Bildvorverarbeitung und -erweiterung vorgenommen werden, da mein Modell gut mit anderen Datensätzen funktioniert.
ViT(fashionMnist)

Code: Select all

import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

class ClassToken(Layer):
def __init__(self, **kwargs):
super().__init__(**kwargs)

def build(self, input_shape):
self.cls = self.add_weight(             #adding a trainable paramter to a custom layer
name="cls",                         #name of the weight
shape=(1, 1, input_shape[-1]),
initializer="zeros",
trainable=True,
)

def call(self, x):
batch_size = tf.shape(x)[0]
cls = tf.tile(self.cls, [batch_size, 1, 1])
x = tf.concat([cls, x], axis=1)
return x

def mlp(x, cf):
x = Dense(cf["mlp_dim"], activation="gelu")(x)
x = Dropout(cf["dropout_rate"])(x)
x = Dense(cf["hidden_dim"])(x)
x = Dropout(cf["dropout_rate"])(x)
return x

def transformer_encoder(x, cf):
skip_1 = x
x = LayerNormalization()(x)
x = MultiHeadAttention(
num_heads=cf["num_heads"], key_dim=cf["hidden_dim"]
)(x, x)
x = Add()([x, skip_1])

skip_2 = x
x = LayerNormalization()(x)
x = mlp(x, cf)
x = Add()([x, skip_2])

return x

def ViT(cf):

input_shape = (cf["num_patches"], cf["patch_size"]*cf["patch_size"]*cf["num_channels"])
inputs = Input(input_shape)

patch_embed = Dense(cf["hidden_dim"])(inputs)

positions = tf.range(start=0, limit=cf["num_patches"], delta=1)
pos_embed = Embedding(input_dim=cf["num_patches"], output_dim=cf["hidden_dim"])(positions)
embed = patch_embed + pos_embed

x = ClassToken()(embed)

for _ in range(cf["num_layers"]):
x = transformer_encoder(x, cf)

x = LayerNormalization()(x)
x = x[:, 0, :]
x = Dense(cf["num_classes"], activation="softmax")(x)

model = Model(inputs, x)
return model

def preprocess_image(image, patch_size):
image = tf.reshape(image, (28, 28, 1))
image = tf.image.resize(image, (28, 28))
image = tf.expand_dims(image, axis=0)
image = tf.cast(image, tf.float32) / 255.0
patches = tf.image.extract_patches(
images=image,
sizes=[1, patch_size, patch_size, 1],
strides=[1, patch_size, patch_size, 1],
rates=[1, 1, 1, 1],
padding='VALID'
)
patches = tf.reshape(patches, (patches.shape[0], -1, patches.shape[-1]))
return patches

if __name__ == "__main__":
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

subset_size = 5000
x_train, y_train = x_train[:subset_size], y_train[:subset_size]
x_test, y_test = x_test[:1000], y_test[:1000]

x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

patch_size = 14
num_patches = (28 // patch_size) ** 2

x_train = tf.image.extract_patches(
images=tf.expand_dims(x_train, axis=-1),
sizes=[1, patch_size, patch_size, 1],
strides=[1, patch_size, patch_size, 1],
rates=[1, 1, 1, 1],
padding='VALID'
)
x_train = tf.reshape(x_train, (x_train.shape[0], num_patches, -1))

x_test = tf.image.extract_patches(
images=tf.expand_dims(x_test, axis=-1),
sizes=[1, patch_size, patch_size, 1],
strides=[1, patch_size, patch_size, 1],
rates=[1, 1, 1, 1],
padding='VALID'
)
x_test = tf.reshape(x_test, (x_test.shape[0], num_patches, -1))

config = {
"num_layers": 4,
"hidden_dim": 128,
"mlp_dim": 256,
"num_heads": 4,
"dropout_rate": 0.1,
"num_patches": num_patches,
"patch_size": patch_size,
"num_channels": 1,
"num_classes":  10
}

model = ViT(config)
model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])

model.fit(x_train, y_train, epochs=5, batch_size=32, validation_data=(x_test, y_test))

model.save("vit_fashion_mnist_cls.h5")

print("Model trained. Ready to classify a new image.")

y_pred = model.predict(x_test)
y_pred_classes = tf.argmax(y_pred, axis=1).numpy()

print("Evaluation Metrics on Test Set:")
print(classification_report(y_test, y_pred_classes))

new_image = x_test[19]

prediction = model.predict(tf.expand_dims(new_image,axis=0))
predicted_class = tf.argmax(prediction, axis=1).numpy()[0]
print(f"Predicted Class: {predicted_class}")

versucht, diese Implementierung auf dem PAD-UFES-20-Datensatz abzubilden.

Geringe Genauigkeit bei der Verwendung von Bildtransformatoren zur Bildklassifizierung

Geringe Genauigkeit bei der Verwendung von Bildtransformatoren zur Bildklassifizierung ⇐ Python

Quick Reply