Ich habe die Funktionsweise eines Vision-Transformators gelernt, konnte ihn jedoch zunächst nicht zum Laufen bringen (den ViT von Grund auf neu erstellen). Aber irgendwie ist es mir gelungen, einen Code zu verschlüsseln, der eine sehr geringe Genauigkeit aufweist (3 %).
Ich habe versucht, mich durch ihn zu chatten, konnte es aber scheinbar nicht herausfinden.
Ich habe das gleiche Verfahren verwendet, das ich bereits verwendet habe ein zuvor erstelltes Vit auf einem anderen Datensatz (fashionMnist), der eine sehr gute Genauigkeit lieferte und versuchte, es auf diesem Datensatz abzubilden.
from google.colab import files
files.upload()
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d mahdavi1202/skin-cancer
import zipfile
import os
with zipfile.ZipFile('skin-cancer.zip', 'r') as zip_ref:
zip_ref.extractall('skin-cancer')
os.listdir('skin-cancer')
import pandas as pd
import shutil
os.mkdir("/content/all_images")
source_dir="skin-cancer/imgs_part_1/imgs_part_1"
file_names = os.listdir(source_dir)
for file_name in file_names:
shutil.move(os.path.join(source_dir, file_name), "/content/all_images")
source_dir="skin-cancer/imgs_part_2/imgs_part_2"
file_names = os.listdir(source_dir)
for file_name in file_names:
shutil.move(os.path.join(source_dir, file_name), "/content/all_images")
source_dir="skin-cancer/imgs_part_3/imgs_part_3"
file_names = os.listdir(source_dir)
for file_name in file_names:
shutil.move(os.path.join(source_dir, file_name), "/content/all_images")
data = pd.read_csv("skin-cancer/metadata.csv")
data['full_link'] = '/kaggle/working/full_images/' + data['img_id']
data.info()
import matplotlib.pyplot as plt
fig, ax1 = plt.subplots(1, 1, figsize= (10, 5))
data['diagnostic'].value_counts().plot(kind='bar', ax=ax1)
diagnostic_classes = {0:'BCC', 1 : 'ACK', 2 : 'NEV', 3 : 'SEK', 4 : 'SCC', 5: 'MEL'}
# a function for encoding classes
def create_class(X):
if X == 'BCC':
return 0
elif X =='ACK':
return 1
elif X == 'NEV':
return 2
elif X == 'SEK':
return 3
elif X == 'SCC':
return 4
elif X == 'MEL':
return 5
else:
print('error class')
data['encoded_class'] = data['diagnostic'].apply(create_class)
data.drop(['diagnostic'], axis = 1, inplace = True)
data.sort_values(by ='patient_id', ascending = True, inplace = True, ignore_index = True)
data.info()
data.drop([ 'biopsed','patient_id','img_id','lesion_id','smoke', 'drink', 'background_father', 'background_mother', 'pesticide', 'gender', 'skin_cancer_history',
'cancer_history', 'has_piped_water', 'has_sewage_system', 'fitspatrick', 'diameter_1', 'diameter_2'], axis = 1, inplace = True)
data.info()
from sklearn.utils import shuffle
import tensorflow as tf
import numpy as np
IMG_SIZE = 32,32
BATCH_SIZE = 256
SEED = 55
AUTO = tf.data.AUTOTUNE
train_data = data[:2000]
test_data = data[2000:]
test_data = shuffle(test_data, random_state = SEED).reset_index(drop = True)
print('train ->', train_data.shape)
print('test ->', test_data.shape)
counts = np.bincount(train_data['encoded_class'])
weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]
weight_for_2 = 1.0 / counts[2]
weight_for_3 = 1.0 / counts[3]
weight_for_4 = 1.0 / counts[4]
weight_for_5 = 1.0 / counts[5]
class_weight = {0: weight_for_0, 1: weight_for_1, 2: weight_for_2, 3: weight_for_3, 4: weight_for_4, 5: weight_for_5}
class_weight
data.info()
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import cv2
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
class ClassToken(Layer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def build(self, input_shape):
self.cls = self.add_weight( #adding a trainable paramter to a custom layer
name="cls", #name of the weight
shape=(1, 1, input_shape[-1]),
initializer="zeros",
trainable=True,
)
def call(self, x):
batch_size = tf.shape(x)[0]
cls = tf.tile(self.cls, [batch_size, 1, 1])
x = tf.concat([cls, x], axis=1)
return x
def mlp(x, cf):
x = Dense(cf["mlp_dim"], activation="gelu")(x)
x = Dropout(cf["dropout_rate"])(x)
x = Dense(cf["hidden_dim"])(x)
x = Dropout(cf["dropout_rate"])(x)
return x
def transformer_encoder(x, cf):
skip_1 = x
x = LayerNormalization()(x)
x = MultiHeadAttention(
num_heads=cf["num_heads"], key_dim=cf["hidden_dim"]
)(x, x)
x = Add()([x, skip_1])
skip_2 = x
x = LayerNormalization()(x)
x = mlp(x, cf)
x = Add()([x, skip_2])
return x
def ViT(cf):
input_shape = (cf["num_patches"], cf["patch_size"]*cf["patch_size"]*cf["num_channels"])
inputs = Input(input_shape)
patch_embed = Dense(cf["hidden_dim"])(inputs)
positions = tf.range(start=0, limit=cf["num_patches"], delta=1)
pos_embed = Embedding(input_dim=cf["num_patches"], output_dim=cf["hidden_dim"])(positions)
embed = patch_embed + pos_embed
x = ClassToken()(embed)
for _ in range(cf["num_layers"]):
x = transformer_encoder(x, cf)
x = LayerNormalization()(x)
x = x[:, 0, :]
x = Dense(cf["num_classes"], activation="softmax")(x)
model = Model(inputs, x)
return model
def preprocess_image(image, patch_size, target_size=(32, 32)): # IMG_SIZE is (32, 32)
# Decode the image within the function if it's a byte string
if isinstance(image, bytes):
image = tf.image.decode_jpeg(image, channels=3) # Decode byte string to tensor
# Ensure image has 3 dimensions (height, width, channels) before resizing
image = tf.ensure_shape(image, [None, None, 3])
image = tf.image.resize(image, target_size)
image = tf.cast(image, tf.float32) / 255.0
# Remove tf.expand_dims(image, axis=0) to avoid extra dimension
patches = tf.image.extract_patches(
images=image,
sizes=[1, patch_size, patch_size, 1],
strides=[1, patch_size, patch_size, 1],
rates=[1, 1, 1, 1],
padding='VALID'
)
# Reshape to (num_patches, patch_size * patch_size * num_channels)
# This should match the input shape expected by your ViT model
patches = tf.reshape(patches, [-1, patch_size * patch_size * 3])
return patches
def create_dataset(df):
image_paths = df['full_link'].values
labels = df['encoded_class'].values
dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))
# Pass the raw file content to preprocess_image
dataset = dataset.map(lambda path, label: (preprocess_image(tf.io.read_file(tf.strings.as_string(path)), cf["patch_size"]), label))
dataset = dataset.batch(BATCH_SIZE)
return dataset
image_dir = "all_images"
data.info()
import tensorflow as tf
def load_and_preprocess_image(image_path, label):
"""Loads an image from a file path, preprocesses it, and returns a tuple of (image, label).
Args:
image_path: The path to the image file.
label: The label associated with the image.
Returns:
A tuple containing the preprocessed image and label.
"""
img = tf.io.read_file(image_path)
img = tf.image.decode_jpeg(img, channels=3)
img = tf.image.resize(img, IMG_SIZE) # Assuming IMG_SIZE is defined as (32, 32)
img = tf.cast(img, tf.float32) / 255.0 # Normalize pixel values
# Convert the image into patches here
img = preprocess_image(img, patch_size=cf["patch_size"]) # patch_size is defined in the cf dictionary
return img, label
def augment(image, label):
"""Applies data augmentation to an image.
Args:
image: The image to augment.
label: The label associated with the image.
Returns:
A tuple containing the augmented image and label.
"""
# You can add more augmentation techniques as needed
image = tf.image.random_flip_left_right(image)
image = tf.image.random_flip_up_down(image)
image = tf.image.random_brightness(image, max_delta=0.2) # Adjust brightness
image = tf.image.random_contrast(image, lower=0.8, upper=1.2) # Adjust contrast
# Other augmentations: random rotation, cropping, etc.
return image, label
IMG_SIZE = (32, 32)
BATCH_SIZE = 256
SEED = 55
AUTO = tf.data.AUTOTUNE
# Create file paths (assuming all images are in /content/all_images)
all_image_files = os.listdir("/content/all_images")
image_paths = ["/content/all_images/" + filename for filename in all_image_files]
# Create labels based on file order and original DataFrame order
labels = []
filenames_no_ext = [f.split('.')[0] for f in all_image_files]
# Ensure correct mapping even after shuffling filenames
for filename in filenames_no_ext:
matching_row = data[data['full_link'].str.contains(filename)] #using full_link which wasn't dropped
if not matching_row.empty:
labels.append(matching_row['encoded_class'].values[0])
else:
print(f"Warning: No matching entry found for file: {filename}")
#Handle missing images in the CSV. One option is to skip or assign a default label
#labels.append(-1) #Example of assigning -1 as a missing label
labels = np.array(labels)
image_paths = np.array(image_paths)
# Shuffle image paths and labels together
image_paths, labels = shuffle(image_paths, labels, random_state=SEED)
# Split data
train_image_paths = image_paths[:2000]
train_labels = labels[:2000]
test_image_paths = image_paths[2000:]
test_labels = labels[2000:]
train_dataset = tf.data.Dataset.from_tensor_slices((train_image_paths, train_labels))
train_dataset = (
train_dataset.map(load_and_preprocess_image, num_parallel_calls=AUTO)
.map(augment, num_parallel_calls=AUTO) # Augmentation
.shuffle(buffer_size=len(train_image_paths))
.batch(BATCH_SIZE)
.prefetch(AUTO)
)
test_dataset = tf.data.Dataset.from_tensor_slices((test_image_paths, test_labels))
test_dataset = (
test_dataset.map(load_and_preprocess_image, num_parallel_calls=AUTO)
.batch(BATCH_SIZE)
.prefetch(AUTO)
)
# ... (Your ViT model definition remains the same) ...
cf = {
"num_patches": (IMG_SIZE[0] // 4) * (IMG_SIZE[1] // 4), #adjust patch size if needed
"patch_size": 4,
"num_channels": 3,
"hidden_dim": 64,
"mlp_dim": 128,
"num_heads": 4,
"num_layers": 4,
"dropout_rate": 0.1,
"num_classes": 6,
}
model = ViT(cf)
# Compile the model
model.compile(
loss="sparse_categorical_crossentropy",
optimizer="adam",
metrics=["accuracy"],
)
# Train the model
history = model.fit(
train_dataset,
epochs=10, # Adjust as needed
validation_data=test_dataset,
class_weight=class_weight
)
# Evaluate the model
loss, accuracy = model.evaluate(test_dataset)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
# Prediction Example
def predict_image(image_path):
img = tf.io.read_file(image_path)
img = tf.image.decode_jpeg(img, channels=3)
img = tf.image.resize(img, IMG_SIZE)
img = tf.image.convert_image_dtype(img, dtype=tf.float32) / 255.0
img = tf.expand_dims(img, axis=0) # Add batch dimension
prediction = model.predict(img)
predicted_class = np.argmax(prediction)
return predicted_class
# Example usage:
sample_image_path = test_image_paths[0]
predicted_class = predict_image(sample_image_path)
print(f"Predicted class for {sample_image_path}: {predicted_class}")
# Save the model
model.save('skin_cancer_vit_model.h5')
make all the changes preferably after the definition of the vit model to make the code work and return the accuracy og the classificaiton of the model
Ein zuvor erstelltes ViT wurde auf einen Mode-Mnist-Datensatz zur Bildklassifizierung abgebildet und hatte eine geringe Genauigkeit. Erwarten Sie eine Genauigkeit von mindestens 50 % für 10 Epochen. Ich gehe davon aus, dass die meisten Änderungen in meiner Bildvorverarbeitung und -erweiterung vorgenommen werden, da mein Modell gut mit anderen Datensätzen funktioniert.
ViT(fashionMnist)
Ich habe die Funktionsweise eines Vision-Transformators gelernt, konnte ihn jedoch zunächst nicht zum Laufen bringen (den ViT von Grund auf neu erstellen). Aber irgendwie ist es mir gelungen, einen Code zu verschlüsseln, der eine sehr geringe Genauigkeit aufweist (3 %). Ich habe versucht, mich durch ihn zu chatten, konnte es aber scheinbar nicht herausfinden. Ich habe das gleiche Verfahren verwendet, das ich bereits verwendet habe ein zuvor erstelltes Vit auf einem anderen Datensatz (fashionMnist), der eine sehr gute Genauigkeit lieferte und versuchte, es auf diesem Datensatz abzubilden. [code]from google.colab import files files.upload()
class_weight = {0: weight_for_0, 1: weight_for_1, 2: weight_for_2, 3: weight_for_3, 4: weight_for_4, 5: weight_for_5} class_weight data.info() import os os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" import cv2 import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder from sklearn.metrics import classification_report from tensorflow.keras.layers import * from tensorflow.keras.models import Model
class ClassToken(Layer): def __init__(self, **kwargs): super().__init__(**kwargs)
def build(self, input_shape): self.cls = self.add_weight( #adding a trainable paramter to a custom layer name="cls", #name of the weight shape=(1, 1, input_shape[-1]), initializer="zeros", trainable=True, )
def call(self, x): batch_size = tf.shape(x)[0] cls = tf.tile(self.cls, [batch_size, 1, 1]) x = tf.concat([cls, x], axis=1) return x
def mlp(x, cf): x = Dense(cf["mlp_dim"], activation="gelu")(x) x = Dropout(cf["dropout_rate"])(x) x = Dense(cf["hidden_dim"])(x) x = Dropout(cf["dropout_rate"])(x) return x
def transformer_encoder(x, cf): skip_1 = x x = LayerNormalization()(x) x = MultiHeadAttention( num_heads=cf["num_heads"], key_dim=cf["hidden_dim"] )(x, x) x = Add()([x, skip_1])
skip_2 = x x = LayerNormalization()(x) x = mlp(x, cf) x = Add()([x, skip_2])
for _ in range(cf["num_layers"]): x = transformer_encoder(x, cf)
x = LayerNormalization()(x) x = x[:, 0, :] x = Dense(cf["num_classes"], activation="softmax")(x)
model = Model(inputs, x) return model
def preprocess_image(image, patch_size, target_size=(32, 32)): # IMG_SIZE is (32, 32) # Decode the image within the function if it's a byte string if isinstance(image, bytes): image = tf.image.decode_jpeg(image, channels=3) # Decode byte string to tensor
# Ensure image has 3 dimensions (height, width, channels) before resizing image = tf.ensure_shape(image, [None, None, 3])
image = tf.image.resize(image, target_size) image = tf.cast(image, tf.float32) / 255.0 # Remove tf.expand_dims(image, axis=0) to avoid extra dimension
patches = tf.image.extract_patches( images=image, sizes=[1, patch_size, patch_size, 1], strides=[1, patch_size, patch_size, 1], rates=[1, 1, 1, 1], padding='VALID' ) # Reshape to (num_patches, patch_size * patch_size * num_channels) # This should match the input shape expected by your ViT model patches = tf.reshape(patches, [-1, patch_size * patch_size * 3]) return patches
dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels)) # Pass the raw file content to preprocess_image dataset = dataset.map(lambda path, label: (preprocess_image(tf.io.read_file(tf.strings.as_string(path)), cf["patch_size"]), label)) dataset = dataset.batch(BATCH_SIZE) return dataset
image_dir = "all_images"
data.info() import tensorflow as tf
def load_and_preprocess_image(image_path, label): """Loads an image from a file path, preprocesses it, and returns a tuple of (image, label).
Args: image_path: The path to the image file. label: The label associated with the image.
Returns: A tuple containing the preprocessed image and label. """ img = tf.io.read_file(image_path) img = tf.image.decode_jpeg(img, channels=3) img = tf.image.resize(img, IMG_SIZE) # Assuming IMG_SIZE is defined as (32, 32) img = tf.cast(img, tf.float32) / 255.0 # Normalize pixel values # Convert the image into patches here img = preprocess_image(img, patch_size=cf["patch_size"]) # patch_size is defined in the cf dictionary return img, label
def augment(image, label): """Applies data augmentation to an image.
Args: image: The image to augment. label: The label associated with the image.
Returns: A tuple containing the augmented image and label. """ # You can add more augmentation techniques as needed image = tf.image.random_flip_left_right(image) image = tf.image.random_flip_up_down(image) image = tf.image.random_brightness(image, max_delta=0.2) # Adjust brightness image = tf.image.random_contrast(image, lower=0.8, upper=1.2) # Adjust contrast # Other augmentations: random rotation, cropping, etc. return image, label IMG_SIZE = (32, 32) BATCH_SIZE = 256 SEED = 55 AUTO = tf.data.AUTOTUNE
# Create file paths (assuming all images are in /content/all_images) all_image_files = os.listdir("/content/all_images") image_paths = ["/content/all_images/" + filename for filename in all_image_files]
# Create labels based on file order and original DataFrame order labels = [] filenames_no_ext = [f.split('.')[0] for f in all_image_files]
# Ensure correct mapping even after shuffling filenames for filename in filenames_no_ext: matching_row = data[data['full_link'].str.contains(filename)] #using full_link which wasn't dropped if not matching_row.empty: labels.append(matching_row['encoded_class'].values[0]) else: print(f"Warning: No matching entry found for file: {filename}") #Handle missing images in the CSV. One option is to skip or assign a default label #labels.append(-1) #Example of assigning -1 as a missing label
# Example usage: sample_image_path = test_image_paths[0] predicted_class = predict_image(sample_image_path) print(f"Predicted class for {sample_image_path}: {predicted_class}")
# Save the model model.save('skin_cancer_vit_model.h5')
make all the changes preferably after the definition of the vit model to make the code work and return the accuracy og the classificaiton of the model [/code] Ein zuvor erstelltes ViT wurde auf einen Mode-Mnist-Datensatz zur Bildklassifizierung abgebildet und hatte eine geringe Genauigkeit. Erwarten Sie eine Genauigkeit von mindestens 50 % für 10 Epochen. Ich gehe davon aus, dass die meisten Änderungen in meiner Bildvorverarbeitung und -erweiterung vorgenommen werden, da mein Modell gut mit anderen Datensätzen funktioniert. ViT(fashionMnist) [code]import os os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" from sklearn.metrics import classification_report import tensorflow as tf from tensorflow.keras.datasets import fashion_mnist from tensorflow.keras.layers import * from tensorflow.keras.models import Model
class ClassToken(Layer): def __init__(self, **kwargs): super().__init__(**kwargs)
def build(self, input_shape): self.cls = self.add_weight( #adding a trainable paramter to a custom layer name="cls", #name of the weight shape=(1, 1, input_shape[-1]), initializer="zeros", trainable=True, )
def call(self, x): batch_size = tf.shape(x)[0] cls = tf.tile(self.cls, [batch_size, 1, 1]) x = tf.concat([cls, x], axis=1) return x
def mlp(x, cf): x = Dense(cf["mlp_dim"], activation="gelu")(x) x = Dropout(cf["dropout_rate"])(x) x = Dense(cf["hidden_dim"])(x) x = Dropout(cf["dropout_rate"])(x) return x
def transformer_encoder(x, cf): skip_1 = x x = LayerNormalization()(x) x = MultiHeadAttention( num_heads=cf["num_heads"], key_dim=cf["hidden_dim"] )(x, x) x = Add()([x, skip_1])
skip_2 = x x = LayerNormalization()(x) x = mlp(x, cf) x = Add()([x, skip_2])
Ich mache eine mit der ML -Modellierung integrierte Geospatial -Bewertung. Das Problem ist der Prozentsatz der sehr geringen Genauigkeit, da mehr Trainingsfunktionen zunehmen, und es wird niedriger....
Ich habe ein Problem mit meinem CNN. Ich muss einen CNN erstellen, der den a- und b-Wert des zentralen Pixels eines 13x13-Pixel-Bildes vorhersagt. Ich habe nur Conv-Layer und als...
Ich habe einen Algorithmus in einer dezentralen Umgebung (Dezentraled Federated Learning) implementiert. Als ich mit mnist und fashion-mnist experimentierte, habe ich eine Genauigkeit von...
Ich mache mein erstes neuronales Netzwerk /künstliche Intelligenz. Um den Code erfolgreich auszuführen, ohne Fehler (die ich sehen kann), aber leider ist die Validierung und allgemeine Genauigkeit...
Ich versuche, meinen Code mithilfe von HPX zu parallelisieren, um die Leistung zu verbessern. Unten ist der Originalcode und mein Versuch, ihn mit HPX umzugestalten.
Originalcode:
std::vector...