def get_device_via_env_variables(deterministic: bool = False, verbose: bool = True) -> torch.device:
device: torch.device = torch.device("cpu")
if torch.cuda.is_available():
if 'CUDA_VISIBLE_DEVICES' not in os.environ:
device: torch.device = torch.device("cuda:0")
else:
gpu_idx: list[str] = os.environ['CUDA_VISIBLE_DEVICES'].split(',')
if len(gpu_idx) == 1:
gpu_idx: str = gpu_idx[0]
else:
# generate random int from 0 to len(gpu_idx) with import statement
import random
idx: int = random.randint(0, len(gpu_idx) - 1) if not deterministic else -1
gpu_idx: str = gpu_idx[idx]
device: torch.device = torch.device(f"cuda:{gpu_idx}")
if verbose:
print(f'{device=}')
return device
Ich habe den Verdacht, dass gpu_idx und CUDA_VISIBLE_DEVICES nicht wirklich übereinstimmen ... Ich möchte nur die richtige GPU laden. Wie mache ich das?
Fehler:
Traceback (most recent call last):aded (0.000 MB deduped)
File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1368, in
main_data_analyis()
File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1163, in main_data_analyis
args: Namespace = load_args()
File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1152, in load_args
args.meta_learner = get_maml_meta_learner(args)
File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/common.py", line 272, in get_maml_meta_learner
base_model = load_model_ckpt(args, path_to_checkpoint=args.path_2_init_maml)
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/mains/common.py", line 265, in load_model_ckpt
base_model, _, _ = load_model_optimizer_scheduler_from_ckpt(args, path_to_checkpoint,
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/mains/common.py", line 81, in load_model_optimizer_scheduler_from_ckpt
ckpt: dict = torch.load(path_to_checkpoint, map_location=torch.device('cuda:3'))
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 607, in load
return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 882, in _load
result = unpickler.load()
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 857, in persistent_load
load_tensor(data_type, size, key, _maybe_decode_ascii(location))
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 846, in load_tensor
loaded_storages[key] = restore_location(storage, location)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 827, in restore_location
return default_restore_location(storage, str(map_location))
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 175, in default_restore_location
result = fn(storage, location)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 151, in _cuda_deserialize
device = validate_cuda_device(location)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 142, in validate_cuda_device
raise RuntimeError('Attempting to deserialize object on CUDA device '
RuntimeError: Attempting to deserialize object on CUDA device 3 but torch.cuda.device_count() is 1. Please use torch.load with map_location to map your storages to an existing device.
Motiviert durch die Tatsache, dass ich versuche, die restlichen 40 GB von meinem 5CNN mit 256- und 512-Filtern zu nutzen, aber es zu Speicherproblemen kommt
Traceback (most recent call last):
File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1368, in
main_data_analyis()
File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1213, in main_data_analyis
stats_analysis_with_emphasis_on_effect_size(args, hist=True)
File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/stats_analysis_with_emphasis_on_effect_size.py", line 74, in stats_analysis_with_emphasis_on_effect_size
results_usl: dict = get_episodic_accs_losses_all_splits_usl(args, args.mdl_sl, loaders)
File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/common.py", line 616, in get_episodic_accs_losses_all_splits_usl
losses, accs = agent.get_lists_accs_losses(data, training)
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 92, in get_lists_accs_losses
spt_embeddings_t = self.get_embedding(spt_x_t, self.base_model).detach()
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 166, in get_embedding
return get_embedding(x=x, base_model=base_model)
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 267, in get_embedding
out = base_model.model.features(x)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward
input = module(input)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 443, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 439, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
RuntimeError: CUDA out of memory. Tried to allocate 174.00 MiB (GPU 0; 79.20 GiB total capacity; 54.31 GiB already allocated; 22.56 MiB free; 54.61 GiB reserved in total by PyTorch)
Ich habe mir diesen Code ausgedacht, aber er führt zu endlosen Fehlern: [code]def get_device_via_env_variables(deterministic: bool = False, verbose: bool = True) -> torch.device: device: torch.device = torch.device("cpu") if torch.cuda.is_available(): if 'CUDA_VISIBLE_DEVICES' not in os.environ: device: torch.device = torch.device("cuda:0") else: gpu_idx: list[str] = os.environ['CUDA_VISIBLE_DEVICES'].split(',') if len(gpu_idx) == 1: gpu_idx: str = gpu_idx[0] else: # generate random int from 0 to len(gpu_idx) with import statement import random idx: int = random.randint(0, len(gpu_idx) - 1) if not deterministic else -1 gpu_idx: str = gpu_idx[idx] device: torch.device = torch.device(f"cuda:{gpu_idx}") if verbose: print(f'{device=}') return device [/code] Ich habe den Verdacht, dass gpu_idx und CUDA_VISIBLE_DEVICES nicht wirklich übereinstimmen ... [url=viewtopic.php?t=30561]Ich möchte[/url] nur die richtige GPU laden. Wie mache ich das? Fehler: [code]Traceback (most recent call last):aded (0.000 MB deduped) File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1368, in main_data_analyis() File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1163, in main_data_analyis args: Namespace = load_args() File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1152, in load_args args.meta_learner = get_maml_meta_learner(args) File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/common.py", line 272, in get_maml_meta_learner base_model = load_model_ckpt(args, path_to_checkpoint=args.path_2_init_maml) File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/mains/common.py", line 265, in load_model_ckpt base_model, _, _ = load_model_optimizer_scheduler_from_ckpt(args, path_to_checkpoint, File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/mains/common.py", line 81, in load_model_optimizer_scheduler_from_ckpt ckpt: dict = torch.load(path_to_checkpoint, map_location=torch.device('cuda:3')) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 607, in load return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 882, in _load result = unpickler.load() File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 857, in persistent_load load_tensor(data_type, size, key, _maybe_decode_ascii(location)) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 846, in load_tensor loaded_storages[key] = restore_location(storage, location) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 827, in restore_location return default_restore_location(storage, str(map_location)) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 175, in default_restore_location result = fn(storage, location) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 151, in _cuda_deserialize device = validate_cuda_device(location) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/serialization.py", line 142, in validate_cuda_device raise RuntimeError('Attempting to deserialize object on CUDA device ' RuntimeError: Attempting to deserialize object on CUDA device 3 but torch.cuda.device_count() is 1. Please use torch.load with map_location to map your storages to an existing device. [/code] Motiviert durch die Tatsache, dass ich versuche, die restlichen 40 GB von meinem 5CNN mit 256- und 512-Filtern zu nutzen, aber es zu Speicherproblemen kommt [code]Traceback (most recent call last): File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1368, in main_data_analyis() File "/lfs/ampere1/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_experiment_analysis_sl_vs_maml_performance_comp_distance.py", line 1213, in main_data_analyis stats_analysis_with_emphasis_on_effect_size(args, hist=True) File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/stats_analysis_with_emphasis_on_effect_size.py", line 74, in stats_analysis_with_emphasis_on_effect_size results_usl: dict = get_episodic_accs_losses_all_splits_usl(args, args.mdl_sl, loaders) File "/afs/cs.stanford.edu/u/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/data_analysis/common.py", line 616, in get_episodic_accs_losses_all_splits_usl losses, accs = agent.get_lists_accs_losses(data, training) File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 92, in get_lists_accs_losses spt_embeddings_t = self.get_embedding(spt_x_t, self.base_model).detach() File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 166, in get_embedding return get_embedding(x=x, base_model=base_model) File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/pretrain_convergence.py", line 267, in get_embedding out = base_model.model.features(x) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl return forward_call(*input, **kwargs) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/container.py", line 139, in forward input = module(input) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl return forward_call(*input, **kwargs) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 443, in forward return self._conv_forward(input, self.weight, self.bias) File "/lfs/ampere1/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 439, in _conv_forward return F.conv2d(input, weight, bias, self.stride, RuntimeError: CUDA out of memory. Tried to allocate 174.00 MiB (GPU 0; 79.20 GiB total capacity; 54.31 GiB already allocated; 22.56 MiB free; 54.61 GiB reserved in total by PyTorch) [/code] Ich möchte GPU 3 verwenden, aber der letzte Fehler sagt GPU 0. Was mache ich falsch? Cross: https://discuss.pytorch.org/t/how-do-you-load-a-special-gpu-from-cuda-available-devices-in-pytorch/174044
Ich habe Nvidia-Driver-580 und cuda-tools-13 auf Debian Trixie installiert (beide sind die neueste Version, die ich finden kann): $ a p t l i s t - - i n s t a l l e d | a g n v i d i a f i r m w a r...
Ich verwende Windows 11 mit einer NVIDIA GeForce RTX 5070 (Rechenfähigkeit sm_120) und versuche, eine GeoAI + QGIS + GPU-PyTorch-Umgebung zu erstellen.
Grundsätzlich versuche ich, die neueste Version...
Ich verwende die C# Windows.Devices.bluetooth -APIs, um mit einem Gerät über Ble zu kommunizieren. Das Trennen gibt es jedoch keine direkte Option. Die einzige Option scheint zu sein, dass alle...
Ich habe Probleme mit TensorFlow, wenn ich meine GPU nicht sehen kann. Ich weiß nicht, ob es ein Problem mit der Installation von CUDA 11.8 gibt, wenn mein Nvidia-Smi sagt: NVIDIA-SMI 572.70...
Ich versuche, ein Swift-Makro zu erstellen, das das Attribut @available(*, veraltet) allen Eigenschaften einer Struktur bedingt hinzufügt, wenn das Makro angewendet wird. Ich möchte dieses Verhalten...