You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I tried to train SpatialNet on the WHAMR! dataset with the command `python SharedTrainer.py fit --config=configs/SpatialNet.yaml --config=configs/datasets/whamr.yaml --model.arch.dim_input=12 --model.arch.dim_output=4 --model.arch.num_freqs=129 --trainer.precision=bf16-mixed --model.compile=True --data.batch_size=[2,4] --trainer.devices=0,1,2,3, --trainer.max_epochs=100`, but I got the following error:
Traceback (most recent call last):
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 43, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 102, in launch
return function(*args, **kwargs)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 580, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 989, in _run
results = self._run_stage()
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1033, in _run_stage
self._run_sanity_check()
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1062, in _run_sanity_check
val_loop.run()
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/loops/utilities.py", line 182, in _decorator
return loop_run(self, *args, **kwargs)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 134, in run
self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 391, in _evaluation_step
output = call._call_strategy_hook(trainer, hook_name, *step_args)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 309, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 402, in validation_step
return self._forward_redirection(self.model, self.lightning_module, "validation_step", *args, **kwargs)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 633, in __call__
wrapper_output = wrapper_module(*args, **kwargs)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1519, in forward
else self._run_ddp_forward(*inputs, **kwargs)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1355, in _run_ddp_forward
return self.module(*inputs, **kwargs) # type: ignore[index]
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 626, in wrapped_forward
out = method(*_args, **_kwargs)
File "/data/lzx/SpatialNet/NBSS/SharedTrainer.py", line 153, in validation_step
yr_hat = self.forward(x)
File "/data/lzx/SpatialNet/NBSS/SharedTrainer.py", line 107, in forward
X, stft_paras = self.stft.stft(x[:, self.channels]) # [B,C,F,T], complex
File "/data/lzx/SpatialNet/NBSS/models/io/stft.py", line 55, in stft
X = torch.stft(x, n_fft=self.n_fft, hop_length=self.n_hop, win_length=self.win_len, window=self.window, return_complex=True)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/functional.py", line 648, in stft
input = F.pad(input.view(extended_shape), [pad, pad], pad_mode)
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/data/lzx/SpatialNet/NBSS/SharedTrainer.py", line 335, in <module>
cli = TrainCLI(
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/cli.py", line 386, in __init__
self._run_subcommand(self.subcommand)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/cli.py", line 677, in _run_subcommand
fn(**fn_kwargs)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 544, in fit
call._call_and_handle_interrupt(
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 68, in _call_and_handle_interrupt
trainer._teardown()
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1012, in _teardown
self.strategy.teardown()
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/strategies/ddp.py", line 405, in teardown
super().teardown()
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/strategies/parallel.py", line 127, in teardown
super().teardown()
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/pytorch_lightning/strategies/strategy.py", line 528, in teardown
self.lightning_module.cpu()
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/lightning_fabric/utilities/device_dtype_mixin.py", line 79, in cpu
return super().cpu()
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/nn/modules/module.py", line 967, in cpu
return self._apply(lambda t: t.cpu())
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/nn/modules/module.py", line 810, in _apply
module._apply(fn)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/nn/modules/module.py", line 810, in _apply
module._apply(fn)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/nn/modules/module.py", line 810, in _apply
module._apply(fn)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/nn/modules/module.py", line 833, in _apply
param_applied = fn(param)
File "/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/nn/modules/module.py", line 967, in <lambda>
return self._apply(lambda t: t.cpu())
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
/opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/lib/../../../.././libstdc++.so.6(_ZN9__gnu_cxx27__verbose_terminate_handlerEv+0xc0)[0x7fc55453ff00]
[ubuntu:39556] [ 4] /opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/lib/../../../.././libstdc++.so.6(+0xb643c)[0x7fc55453e43c]
[ubuntu:39556] [ 5] /opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/lib/../../../.././libstdc++.so.6(+0xb648e)[0x7fc55453e48e]
[ubuntu:39556] [ 6] /opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/lib/../../../.././libstdc++.so.6(+0xb6435)[0x7fc55453e435]
[ubuntu:39556] [ 7] /opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so(+0xcc8475)[0x7fc4b780f475]
[ubuntu:39556] [ 8] /opt/anaconda3/envs/lzx/lib/python3.8/site-packages/torch/lib/../../../.././libstdc++.so.6(+0xd3e95)[0x7fc55455be95]
[ubuntu:39556] [ 9] /lib/x86_64-linux-gnu/libpthread.so.0(+0x8609)[0x7fc5583c9609]
[ubuntu:39556] [10] /lib/x86_64-linux-gnu/libc.so.6(clone+0x43)[0x7fc558194353]
[ubuntu:39556] *** End of error message ***
Aborted (core dumped)
May I ask whether you have encountered a similar problem, and how to fix it?
The text was updated successfully, but these errors were encountered:
I tried to train SpatialNet on the WHAMR! dataset with the command
python SharedTrainer.py fit --config=configs/SpatialNet.yaml --config=configs/datasets/whamr.yaml --model.arch.dim_input=12 --model.arch.dim_output=4 --model.arch.num_freqs=129 --trainer.precision=bf16-mixed --model.compile=True --data.batch_size=[2,4] --trainer.devices=0,1,2,3, --trainer.max_epochs=100
, but I got an error:
May I ask whether you have encountered a similar problem, and how to fix it?
The text was updated successfully, but these errors were encountered: