Why does my GPU memory keep growing when I sweep over model parameters?
Posted: 06 Jan 2025, 22:34
I am trying to evaluate the classification error of a model at different dropout rates for a fixed architecture. While doing this, GPU memory usage keeps growing, and I cannot prevent it (see the output and code below for details):
Below is the relevant part of the code I am running, including some unsuccessful attempts to free the memory after each iteration.
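In skeleton form, every inner iteration does the following (a trimmed-down sketch of the loop further down; the tfds input pipeline, the actual model.fit call and the CSV writing are left out here):

Code: Select all
import gc
import tensorflow as tf

for trial in range(10):
    # Build and compile a fresh model each iteration (the real one has four Dense(N) layers).
    model = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(2048, activation='relu'),
        tf.keras.layers.Dense(10),
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    )
    # ... model.fit(...) on one of the splits ...
    # Attempts to release the model's GPU memory afterwards:
    del model
    tf.keras.backend.clear_session()
    gc.collect()
    print(tf.config.experimental.get_memory_info('GPU:0'))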
How can I run this loop efficiently without the memory usage growing?
Code: Select all
N=2048 split 0 memory usage
{'current': 170630912, 'peak': 315827456}
{'current': 345847552, 'peak': 430210560}
{'current': 530811136, 'peak': 610477568}
...
{'current': 1795582208, 'peak': 1873805056}
N=2048 split 1 memory usage
{'current': 1978317568, 'peak': 2056609280}
{'current': 2157136640, 'peak': 2235356160}
...
2024-12-15 18:55:04.141690: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:497] Allocator (GPU_0_bfc) ran out of memory trying to allocate 52.00MiB (rounded to 54531328)requested by op
...
2024-12-15 18:55:04.144298: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: RESOURCE_EXHAUSTED: Out of memory while trying to allocate 54531208 bytes.
...
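The growth of roughly 170–185 MB per iteration is close to what one model's float32 variables plus Adam's two slot variables should occupy, which makes me suspect the variables of the previous models are simply never released. A rough back-of-the-envelope check for N=2048:

Code: Select all
# Rough variable footprint of one model for N = 2048 (float32, 4 bytes per value).
N = 2048
params = 784 * N + N              # Flatten(28*28) -> first Dense(N)
params += 3 * (N * N + N)         # three more Dense(N) layers (Dropout has no weights)
params += N * 10 + 10             # final Dense(10)
weight_bytes = params * 4         # float32 weights
with_adam = weight_bytes * 3      # Adam keeps two extra slots (m, v) per weight
print(params, weight_bytes / 1e6, with_adam / 1e6)
# -> 14217226 parameters, ~56.9 MB of weights, ~170.6 MB including optimizer slots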
Code: Select all
import tensorflow as tf
import tensorflow_datasets as tfds
import gc

batch_size = 128
sizes = [2048 + n * batch_size * 5 for n in range(10)]
dropout_points = 10

# Ten cross-validation folds: each validation split is a 10% slice of 'train',
# the corresponding training split is the remaining 90%.
vals_ds = tfds.load(
    'mnist',
    split=[f'train[{k}%:{k+10}%]' for k in range(0, 100, 10)],
    as_supervised=True,
)
trains_ds = tfds.load(
    'mnist',
    split=[f'train[:{k}%]+train[{k+10}%:]' for k in range(0, 100, 10)],
    as_supervised=True,
)
_, ds_info = tfds.load('mnist', with_info=True)

def normalize_img(image, label):
    return tf.cast(image, tf.float32) / 255., label

for N in sizes:
    for i, (ds_train, ds_test) in enumerate(zip(trains_ds, vals_ds)):
        ds_train = ds_train.map(normalize_img, num_parallel_calls=tf.data.AUTOTUNE)
        ds_train = ds_train.shuffle(ds_info.splits['train'].num_examples)
        ds_train = ds_train.batch(128)
        ds_test = ds_test.map(normalize_img, num_parallel_calls=tf.data.AUTOTUNE)
        ds_test = ds_test.batch(128)
        print(f"N={N} split {i} memory usage")
        with open(f"out_{N}_{i}.csv", "w") as f:
            f.write(("retention_rate,"
                     "train_loss,"
                     "train_err,"
                     "test_loss,"
                     "test_err,"
                     "epochs\n"))
            for p in range(dropout_points):
                dropout_rate = p / dropout_points
                # Four hidden Dense(N) layers, each followed by Dropout.
                layers = [tf.keras.layers.Flatten(input_shape=(28, 28))]
                for _ in range(4):
                    layers.append(tf.keras.layers.Dense(N, activation='relu'))
                    layers.append(tf.keras.layers.Dropout(dropout_rate))
                layers.append(tf.keras.layers.Dense(10))
                with tf.device('/GPU:0'):
                    model = tf.keras.models.Sequential(layers)
                    model.compile(
                        optimizer=tf.keras.optimizers.Adam(0.001),
                        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
                    )
                    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
                    history = model.fit(
                        ds_train,
                        epochs=100,
                        validation_data=ds_test,
                        verbose=0,
                        callbacks=[callback],
                    )
                train_loss, train_acc = model.evaluate(ds_train, verbose=0)
                test_loss, test_acc = model.evaluate(ds_test, verbose=0)
                epochs = len(history.history['loss'])
                f.write((
                    f"{1 - dropout_rate},"
                    f"{train_loss},"
                    f"{1 - train_acc},"
                    f"{test_loss},"
                    f"{1 - test_acc},"
                    f"{epochs}\n"))
                # Unsuccessful attempts to release the GPU memory of the model just trained:
                del model
                tf.keras.backend.clear_session()
                gc.collect()
                print(tf.config.experimental.get_memory_info('GPU:0'))
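If there is no way to get that memory back inside a single process, I suppose the fallback would be to run each configuration in its own process so everything is returned to the driver when the process exits; a hypothetical, untested sketch (train_one_config is just a placeholder name for the body of the inner loop above):

Code: Select all
# Hypothetical fallback (untested): one child process per configuration, so all GPU
# memory is released when that process exits.
import multiprocessing as mp

def train_one_config(N, split_index, dropout_rate):
    # Placeholder for the body of the inner loop above: build, compile, fit,
    # evaluate and write one CSV row for a single (N, split, dropout) combination.
    ...

if __name__ == '__main__':
    ctx = mp.get_context('spawn')  # 'spawn' so each child gets its own fresh CUDA context
    for N in [2048 + n * 128 * 5 for n in range(10)]:  # same sizes as above
        for split_index in range(10):                  # same 10 splits as above
            for p in range(10):                        # same dropout grid as above
                proc = ctx.Process(target=train_one_config,
                                   args=(N, split_index, p / 10))
                proc.start()
                proc.join()

Is something like that really necessary, or can the memory be released within one process?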