Upgrade to Synapse AI Release 1.12.1 (#106)
* Upgrade to Synapse AI Release 1.12.1

Update images and modify hooks to suit lightning 2.1

Signed-off-by: Jerome <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jirka Borovec <[email protected]>
3 people committed Oct 25, 2023
1 parent 73149db commit 5b00a32
Showing 8 changed files with 54 additions and 22 deletions.
8 changes: 4 additions & 4 deletions .azure/hpu-tests.yml
@@ -30,15 +30,15 @@ jobs:
strategy:
matrix:
'w. pytorch-lightning | pypi':
image: "1.12.0/ubuntu22.04/habanalabs/pytorch-installer-2.0.1:latest"
image: "1.12.1/ubuntu22.04/habanalabs/pytorch-installer-2.0.1:latest"
dependency: "pytorch-lightning"
pkg_source: "pypi"
'w. lightning | pypi':
image: "1.12.0/ubuntu22.04/habanalabs/pytorch-installer-2.0.1:latest"
image: "1.12.1/ubuntu22.04/habanalabs/pytorch-installer-2.0.1:latest"
dependency: "lightning"
pkg_source: "pypi"
'w. lightning | source':
image: "1.12.0/ubuntu22.04/habanalabs/pytorch-installer-2.0.1:latest"
image: "1.12.1/ubuntu22.04/habanalabs/pytorch-installer-2.0.1:latest"
dependency: "lightning"
pkg_source: "source"
pool: "intel-hpus"
@@ -52,7 +52,7 @@ jobs:
--shm-size=4g \
-v /usr/bin/docker:/tmp/docker:ro"
variables:
DEEPSPEED_VERSION: "1.12.0"
DEEPSPEED_VERSION: "1.12.1"
workspace:
clean: all

24 changes: 24 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,29 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

-

### Changed

-

### Fixed

-

### Removed

-

### Deprecated

-


## [1.2.0] - 2023-10-26

### Added

- Added tests, examples and documentation for HPUPrecisionPlugin with autocast ([#94](https://github.com/Lightning-AI/lightning-Habana/pull/94))
- Added test to validate checkpoint resuming with HPUDeepSpeedStrategy ([#95](https://github.com/Lightning-AI/lightning-Habana/pull/95))
- Added support for lightning 2.1 ([#100](https://github.com/Lightning-AI/lightning-Habana/pull/100), [#105](https://github.com/Lightning-AI/lightning-Habana/pull/105))
@@ -17,6 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Changed HPU docker image based on synapse AI release 1.12.0 ([#90](https://github.com/Lightning-AI/lightning-Habana/pull/90))
- Use standard API's and Remove env variable to get HPU distributed backend ([#91](https://github.com/Lightning-AI/lightning-Habana/pull/91))
+ - Changed HPU docker image based on synapse AI release 1.12.1, updated hooks ([#106](https://github.com/Lightning-AI/lightning-Habana/pull/106))


### Fixed
4 changes: 2 additions & 2 deletions README.md
@@ -63,11 +63,11 @@ The `devices>1` parameter with HPUs enables the Habana accelerator for distributed training.

# Support Matrix

- | **SynapseAI** | **1.12.0** |
+ | **SynapseAI** | **1.12.1** |
| --------------------- | -------------------------------------------------- |
| PyTorch | 2.0.1 |
| (PyTorch) Lightning\* | 2.1.x |
- | **Lightning Habana** | **1.1.0** |
+ | **Lightning Habana** | **1.2.0** |
| DeepSpeed\*\* | Forked from v0.9.4 of the official DeepSpeed repo. |

\* covers both packages [`lightning`](https://pypi.org/project/lightning/) and [`pytorch-lightning`](https://pypi.org/project/pytorch-lightning/)
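For context, here is a minimal launch sketch consistent with the support matrix above. It is not code from this PR: the imports mirror the package layout touched by the diff, but the model and datamodule are placeholders and the exact arguments are illustrative.

```python
# Hedged sketch: distributed training on 8 Gaudi devices with the pinned
# stack (SynapseAI 1.12.1, PyTorch 2.0.1, Lightning 2.1.x, lightning-habana 1.2.0).
from lightning import Trainer
from lightning_habana.pytorch.accelerator import HPUAccelerator
from lightning_habana.pytorch.strategies import HPUParallelStrategy

trainer = Trainer(
    accelerator=HPUAccelerator(),
    devices=8,                       # devices > 1 enables distributed training
    strategy=HPUParallelStrategy(),  # DDP-based HPU strategy updated in this PR
)
# trainer.fit(model, datamodule)    # supply your own LightningModule / datamodule
```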
2 changes: 1 addition & 1 deletion src/lightning_habana/__about__.py
@@ -1,4 +1,4 @@
__version__ = "1.1.1.dev"
__version__ = "1.2.0"
__author__ = "Lightning-AI et al."
__author_email__ = "[email protected]"
__license__ = "Apache-2.0"
4 changes: 4 additions & 0 deletions src/lightning_habana/pytorch/accelerator.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+ import os
from typing import Any, Dict, List, Optional, Union

import torch
@@ -54,6 +55,9 @@ def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]:
return get_device_stats(device)

def teardown(self) -> None:
os.environ.pop("HABANA_PROFILE", None)
os.environ.pop("HLS_MODULE_ID", None)
os.environ.pop("ID", None)
pass

@staticmethod
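The `teardown` addition above clears Habana-related environment variables when the accelerator shuts down. A standalone sketch of the idiom (the helper name is invented for illustration):

```python
import os

def _clear_habana_env() -> None:
    # Same pattern as HPUAccelerator.teardown in this diff: pop with a None
    # default removes the variable if set and is a silent no-op otherwise,
    # so repeated teardowns never raise KeyError.
    for key in ("HABANA_PROFILE", "HLS_MODULE_ID", "ID"):
        os.environ.pop(key, None)

_clear_habana_env()
assert "HABANA_PROFILE" not in os.environ
```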
6 changes: 3 additions & 3 deletions src/lightning_habana/pytorch/strategies/deepspeed.py
@@ -82,8 +82,8 @@
warning_cache = WarningCache()

_HPU_DEEPSPEED_AVAILABLE = (
- # HPU deep speed is supported only through this pip install git+https://github.com/HabanaAI/[email protected]
- RequirementCache("deepspeed==0.9.4+hpu.synapse.v1.12.0")
+ # HPU deep speed is supported only through this pip install git+https://github.com/HabanaAI/[email protected]
+ RequirementCache("deepspeed==0.9.4+hpu.synapse.v1.12.1")
)
if TYPE_CHECKING and _HPU_DEEPSPEED_AVAILABLE:
import deepspeed
@@ -295,7 +295,7 @@ def __init__(
if not _HPU_DEEPSPEED_AVAILABLE:
raise MisconfigurationException(
"To use the `HPUDeepSpeedStrategy`, you must have hpu DeepSpeed installed."
" Install it by running `pip install git+https://github.com/HabanaAI/[email protected].0`."
" Install it by running `pip install git+https://github.com/HabanaAI/[email protected].1`."
)

super().__init__(
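The version bump above works because availability is checked with a pinned `RequirementCache`: the strategy refuses to construct unless the matching Habana DeepSpeed fork is installed. A sketch of that gating idiom, assuming only the lightning-utilities package:

```python
# RequirementCache is truthy only when the installed distribution satisfies
# the exact specifier, so a missing or mismatched fork fails fast instead of
# importing an incompatible deepspeed build.
from lightning_utilities.core.imports import RequirementCache

_HPU_DEEPSPEED_AVAILABLE = RequirementCache("deepspeed==0.9.4+hpu.synapse.v1.12.1")

if _HPU_DEEPSPEED_AVAILABLE:
    import deepspeed  # the HabanaAI fork installed from the 1.12.1 tag
```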
14 changes: 8 additions & 6 deletions src/lightning_habana/pytorch/strategies/parallel.py
@@ -27,6 +27,7 @@
from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO
from lightning.pytorch.plugins.precision import PrecisionPlugin
from lightning.pytorch.strategies.ddp import DDPStrategy
+ from lightning.pytorch.utilities.types import STEP_OUTPUT
elif module_available("pytorch_lightning"):
from lightning_fabric.plugins import CheckpointIO, ClusterEnvironment
from lightning_fabric.utilities.distributed import group as _group
@@ -36,6 +37,7 @@
from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO
from pytorch_lightning.plugins.precision import PrecisionPlugin
from pytorch_lightning.strategies.ddp import DDPStrategy
+ from pytorch_lightning.utilities.types import STEP_OUTPUT
else:
raise ModuleNotFoundError("You are missing `lightning` or `pytorch-lightning` package, please install it.")
from torch import Tensor
@@ -138,20 +140,20 @@ def optimizer_step(
htcore.mark_step()
return optimizer_output

- def validation_step(self, batch: Any, batch_idx: int) -> Any:
+ def validation_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
# Break lazy accumulation of graph after every step
htcore.mark_step()
- return super().validation_step(batch, batch_idx)
+ return super().validation_step(*args, **kwargs)

- def test_step(self, batch: Any, batch_idx: int) -> Any:
+ def test_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
# Break lazy accumulation of graph after every step
htcore.mark_step()
- return super().test_step(batch, batch_idx)
+ return super().test_step(*args, **kwargs)

- def predict_step(self, batch: Any, batch_idx: int) -> Any:
+ def predict_step(self, *args: Any, **kwargs: Any) -> Any:
# Break lazy accumulation of graph after every step
htcore.mark_step()
- return super().predict_step(batch, batch_idx)
+ return super().predict_step(*args, **kwargs)

def reduce(
self, tensor: Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean"
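Two things change in these hooks (mirrored in single.py below): the signatures now forward `*args`/`**kwargs` verbatim, matching the Lightning 2.1 strategy API, and each step still calls `mark_step()` first. A hedged illustration of why that call matters in HPU lazy mode, assuming a Gaudi device inside the 1.12.1 container:

```python
import torch
import habana_frameworks.torch.core as htcore  # importing this registers the "hpu" device

x = torch.randn(4, 4, device="hpu")
y = (x @ x).relu()  # in lazy mode this is only queued into the device graph
htcore.mark_step()  # flush: the accumulated graph actually executes
print(y.cpu())      # results are materialized after the flush
```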
14 changes: 8 additions & 6 deletions src/lightning_habana/pytorch/strategies/single.py
@@ -24,6 +24,7 @@
from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO
from lightning.pytorch.plugins.precision import PrecisionPlugin
from lightning.pytorch.strategies.single_device import SingleDeviceStrategy
+ from lightning.pytorch.utilities.types import STEP_OUTPUT
elif module_available("pytorch_lightning"):
from lightning_fabric.plugins import CheckpointIO
from lightning_fabric.utilities.types import _DEVICE
@@ -32,6 +33,7 @@
from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO
from pytorch_lightning.plugins.precision import PrecisionPlugin
from pytorch_lightning.strategies.single_device import SingleDeviceStrategy
+ from pytorch_lightning.utilities.types import STEP_OUTPUT
else:
raise ModuleNotFoundError("You are missing `lightning` or `pytorch-lightning` package, please install it.")

@@ -107,20 +109,20 @@ def optimizer_step(
htcore.mark_step()
return optimizer_output

- def validation_step(self, batch: Any, batch_idx: int) -> Any:
+ def validation_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
# Break lazy accumulation of graph after every step
htcore.mark_step()
- return super().validation_step(batch, batch_idx)
+ return super().validation_step(*args, **kwargs)

- def test_step(self, batch: Any, batch_idx: int) -> Any:
+ def test_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
# Break lazy accumulation of graph after every step
htcore.mark_step()
- return super().test_step(batch, batch_idx)
+ return super().test_step(*args, **kwargs)

- def predict_step(self, batch: Any, batch_idx: int) -> Any:
+ def predict_step(self, *args: Any, **kwargs: Any) -> Any:
# Break lazy accumulation of graph after every step
htcore.mark_step()
- return super().predict_step(batch, batch_idx)
+ return super().predict_step(*args, **kwargs)

@classmethod
def register_strategies(cls, strategy_registry: Dict) -> None:
