From e904d310928946574c8834fd075ded510abfac6b Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Wed, 26 Mar 2025 10:06:26 -0700 Subject: [PATCH] Disable use_cuda for local_sgd_integ_tests --- torchft/local_sgd_integ_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/torchft/local_sgd_integ_test.py b/torchft/local_sgd_integ_test.py index 917e853..456b178 100644 --- a/torchft/local_sgd_integ_test.py +++ b/torchft/local_sgd_integ_test.py @@ -197,9 +197,11 @@ def state_dict() -> Dict[str, Dict[str, object]]: # pyre-ignore[53] class LocalSGDIntegTest(TestCase): + # TODO: race condition due to using NCCL in threads causes manager allreduce to sometimes not be correct + # Because of that the test is disabled for cuda @parameterized.expand( [ - (True,), + # (True,), (False,), ] ) @@ -259,7 +261,7 @@ def test_local_sgd_recovery(self, use_cuda: bool) -> None: @parameterized.expand( [ - (True,), + # (True,), (False,), ] ) @@ -319,7 +321,7 @@ def test_diloco_healthy(self, use_cuda: bool) -> None: @parameterized.expand( [ - (True,), + # (True,), (False,), ] )