From c3fa66d901f1a5dd7fa250e6299ac27827c3e59d Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Fri, 27 Oct 2023 12:08:29 +0100 Subject: [PATCH 01/19] registry wip --- configs/SkinLesionSegmentation.yaml | 6 + deeplightning/model/__init__.py | 12 ++ deeplightning/{models => model}/cnn.py | 0 deeplightning/{models => model}/convmixer.py | 0 deeplightning/{models => model}/dcgan.py | 0 deeplightning/{models => model}/gan.py | 0 deeplightning/{models => model}/lenet.py | 0 deeplightning/{models => model}/mlpmixer.py | 0 .../{models => model}/mobilenetv2.py | 0 .../{models => model}/mobilenetv3.py | 0 deeplightning/{models => model}/resnet.py | 0 deeplightning/{models => model}/unet.py | 0 deeplightning/{models => model}/vit.py | 0 deeplightning/{models => model}/vit2.py | 0 deeplightning/{models => model}/vit_old.py | 0 deeplightning/{models => model}/vqvae.py | 0 deeplightning/models/__init__.py | 12 -- deeplightning/task/specs.py | 34 +++++ deeplightning/trainer/metrics/__init__.py | 1 + deeplightning/utils/metrics.py | 131 ++++++++++++------ 20 files changed, 138 insertions(+), 58 deletions(-) create mode 100755 deeplightning/model/__init__.py rename deeplightning/{models => model}/cnn.py (100%) rename deeplightning/{models => model}/convmixer.py (100%) rename deeplightning/{models => model}/dcgan.py (100%) rename deeplightning/{models => model}/gan.py (100%) rename deeplightning/{models => model}/lenet.py (100%) rename deeplightning/{models => model}/mlpmixer.py (100%) rename deeplightning/{models => model}/mobilenetv2.py (100%) rename deeplightning/{models => model}/mobilenetv3.py (100%) rename deeplightning/{models => model}/resnet.py (100%) rename deeplightning/{models => model}/unet.py (100%) rename deeplightning/{models => model}/vit.py (100%) rename deeplightning/{models => model}/vit2.py (100%) rename deeplightning/{models => model}/vit_old.py (100%) rename deeplightning/{models => model}/vqvae.py (100%) delete mode 100755 
deeplightning/models/__init__.py create mode 100644 deeplightning/task/specs.py create mode 100644 deeplightning/trainer/metrics/__init__.py diff --git a/configs/SkinLesionSegmentation.yaml b/configs/SkinLesionSegmentation.yaml index f9df6dd..f6e8a2f 100755 --- a/configs/SkinLesionSegmentation.yaml +++ b/configs/SkinLesionSegmentation.yaml @@ -3,6 +3,7 @@ modes: test: false task: SemanticSegmentation + data: dataset: HAM10000 root: /Users/pme/research/data/HAM10000 @@ -51,6 +52,11 @@ engine: num_nodes: 1 precision: 32 +metrics: + train: default + val: default + test: default + train: num_epochs: 10 val_every_n_epoch: 1 diff --git a/deeplightning/model/__init__.py b/deeplightning/model/__init__.py new file mode 100755 index 0000000..38b9fc0 --- /dev/null +++ b/deeplightning/model/__init__.py @@ -0,0 +1,12 @@ +from deeplightning.model.cnn import * +from deeplightning.model.convmixer import * +from deeplightning.model.dcgan import * +from deeplightning.model.gan import * +from deeplightning.model.lenet import * +from deeplightning.model.mlpmixer import * +from deeplightning.model.mobilenetv2 import * +from deeplightning.model.mobilenetv3 import * +from deeplightning.model.resnet import * +from deeplightning.model.unet import * +from deeplightning.model.vit import * +from deeplightning.model.vqvae import * diff --git a/deeplightning/models/cnn.py b/deeplightning/model/cnn.py similarity index 100% rename from deeplightning/models/cnn.py rename to deeplightning/model/cnn.py diff --git a/deeplightning/models/convmixer.py b/deeplightning/model/convmixer.py similarity index 100% rename from deeplightning/models/convmixer.py rename to deeplightning/model/convmixer.py diff --git a/deeplightning/models/dcgan.py b/deeplightning/model/dcgan.py similarity index 100% rename from deeplightning/models/dcgan.py rename to deeplightning/model/dcgan.py diff --git a/deeplightning/models/gan.py b/deeplightning/model/gan.py similarity index 100% rename from deeplightning/models/gan.py 
rename to deeplightning/model/gan.py diff --git a/deeplightning/models/lenet.py b/deeplightning/model/lenet.py similarity index 100% rename from deeplightning/models/lenet.py rename to deeplightning/model/lenet.py diff --git a/deeplightning/models/mlpmixer.py b/deeplightning/model/mlpmixer.py similarity index 100% rename from deeplightning/models/mlpmixer.py rename to deeplightning/model/mlpmixer.py diff --git a/deeplightning/models/mobilenetv2.py b/deeplightning/model/mobilenetv2.py similarity index 100% rename from deeplightning/models/mobilenetv2.py rename to deeplightning/model/mobilenetv2.py diff --git a/deeplightning/models/mobilenetv3.py b/deeplightning/model/mobilenetv3.py similarity index 100% rename from deeplightning/models/mobilenetv3.py rename to deeplightning/model/mobilenetv3.py diff --git a/deeplightning/models/resnet.py b/deeplightning/model/resnet.py similarity index 100% rename from deeplightning/models/resnet.py rename to deeplightning/model/resnet.py diff --git a/deeplightning/models/unet.py b/deeplightning/model/unet.py similarity index 100% rename from deeplightning/models/unet.py rename to deeplightning/model/unet.py diff --git a/deeplightning/models/vit.py b/deeplightning/model/vit.py similarity index 100% rename from deeplightning/models/vit.py rename to deeplightning/model/vit.py diff --git a/deeplightning/models/vit2.py b/deeplightning/model/vit2.py similarity index 100% rename from deeplightning/models/vit2.py rename to deeplightning/model/vit2.py diff --git a/deeplightning/models/vit_old.py b/deeplightning/model/vit_old.py similarity index 100% rename from deeplightning/models/vit_old.py rename to deeplightning/model/vit_old.py diff --git a/deeplightning/models/vqvae.py b/deeplightning/model/vqvae.py similarity index 100% rename from deeplightning/models/vqvae.py rename to deeplightning/model/vqvae.py diff --git a/deeplightning/models/__init__.py b/deeplightning/models/__init__.py deleted file mode 100755 index 0af42ee..0000000 --- 
a/deeplightning/models/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from deeplightning.models.cnn import * -from deeplightning.models.convmixer import * -from deeplightning.models.dcgan import * -from deeplightning.models.gan import * -from deeplightning.models.lenet import * -from deeplightning.models.mlpmixer import * -from deeplightning.models.mobilenetv2 import * -from deeplightning.models.mobilenetv3 import * -from deeplightning.models.resnet import * -from deeplightning.models.unet import * -from deeplightning.models.vit import * -from deeplightning.models.vqvae import * diff --git a/deeplightning/task/specs.py b/deeplightning/task/specs.py new file mode 100644 index 0000000..1bf471b --- /dev/null +++ b/deeplightning/task/specs.py @@ -0,0 +1,34 @@ +from typing import Any +from omegaconf import OmegaConf +from deeplightning.registry import METRICS_REGISTRY, TASK_REGISTRY + + +__TASKS__ = [ + "ImageClassification", + "SemanticSegmentation", +] + + +#|TODO register metrics with decorator +__METRICS__ = {"classification_accuracy": None} + + +class TaskSpecification(): + def __init__(self, cfg: OmegaConf): + assert cfg.task in __TASKS__ + self.task = cfg.task + + +class ImageClassificationTask(TaskSpecification): + def __init__(self, cfg: OmegaConf): + super().__init__() + if cfg.task = + self.metrics = [ + "classification_accuracy", + ] + for m in self.metrics: + assert m in METRICS_REGISTRY.get_element_names() + + +def ImageClassificationSpec(cfg: OmegaConf) -> ImageClassificationTask: + return ImageClassificationTask(cfg) \ No newline at end of file diff --git a/deeplightning/trainer/metrics/__init__.py b/deeplightning/trainer/metrics/__init__.py new file mode 100644 index 0000000..c58acfa --- /dev/null +++ b/deeplightning/trainer/metrics/__init__.py @@ -0,0 +1 @@ +from deeplightning.trainer.metrics import * \ No newline at end of file diff --git a/deeplightning/utils/metrics.py b/deeplightning/utils/metrics.py index dc2b211..9aeb698 100755 --- 
a/deeplightning/utils/metrics.py +++ b/deeplightning/utils/metrics.py @@ -9,93 +9,132 @@ from torchmetrics.functional.classification.accuracy import accuracy import seaborn as sn import numpy as np -from matplotlib.figure import Figure as pltFigure +from matplotlib.figure import Figure from matplotlib import pyplot as plt +from deeplightning.registry import METRICS_REGISTRY -class Metric_PrecisionRecallCurve(MulticlassPrecisionRecallCurve): - """Precision-Recall metric class; inherits methods from - torchmetrics parent class. +__all__ = [ + "ClassificationAccuracy", "classification_accuracy", + "PrecisionRecallCurve", "precision_recall_curve", + "ConfusionMatrix", "confusion_matrix", +] + + +class ClassificationAccuracy(MulticlassAccuracy): + """Classification Accuracy metric, inheriting from torchmetrics + """ + def __init__(self, cfg: OmegaConf): + self.num_classes = cfg.model.network.params.num_classes + args = { + "num_classes": self.num_classes, + } + super().__init__(**args) + + +@METRICS_REGISTRY.register_element() +def classification_accuracy(cfg) -> ClassificationAccuracy: + return ClassificationAccuracy(cfg) + + +class PrecisionRecallCurve(MulticlassPrecisionRecallCurve): + """Precision-Recall metric class, inheriting from torchmetrics """ def __init__(self, cfg: OmegaConf): self.num_classes = cfg.model.network.params.num_classes args = { - "task": "binary" if self.num_classes == 2 else "multiclass", "num_classes": self.num_classes, - } + } super().__init__(**args) - - def draw(self, precision: Tensor, recall: Tensor, thresholds: Tensor, subset: str, epoch: int) -> pltFigure: - """Draw Confusion Matrix as a figure, to be logged as artifact media. - - Parameters - ---------- - precision : precisions. - recall : recalls. - thresholds: threshold - subset : data subset (e.g. 'train' or 'val), to be used - as a label in the figure. - epoch : current epoch, to be used as a label in the figure. 
+ def draw(self, + precision: Tensor, + recall: Tensor, + thresholds: Tensor, + stage: str, + epoch: int + ) -> Figure: + """Draw Precision-Recall Curve as a figure, to be logged as artifact media + + Args: + precision: precisions values + recall: recalls values + thresholds: threshold values + stage: data subset {"train", "val", "test"}, for labelling + epoch: current epoch, for labelling """ - assert self.num_classes == len(precision) and self.num_classes == len(recall) + assert self.num_classes == len(precision) + assert self.num_classes == len(recall) + # draw figure fig = plt.figure() for i in range(self.num_classes): plt.plot(recall[i].cpu(), precision[i].cpu(), label=i) - plt.title(f"Precision-Recall Curve [{subset}, epoch {epoch}]") + plt.title(f"Precision-Recall Curve [{stage}, epoch {epoch}]") plt.xlabel("Recall") plt.ylabel("Precision") if self.num_classes <= 10: plt.legend(loc="lower left", title="class", fontsize='small') plt.close() return fig + - -class Metric_ConfusionMatrix(MulticlassConfusionMatrix): - """Confusion Matrix metric class; inherits methods from - torchmetrics parent class. +@METRICS_REGISTRY.register_element() +def precision_recall_curve(cfg) -> PrecisionRecallCurve: + return PrecisionRecallCurve(cfg) + + +class ConfusionMatrix(MulticlassConfusionMatrix): + """Confusion Matrix metric class, inheriting from torchmetrics """ def __init__(self, cfg: OmegaConf): self.num_classes = cfg.model.network.params.num_classes args = { - "task": "binary" if self.num_classes == 2 else "multiclass", "num_classes": self.num_classes, - "normalize": "true", # 'true' normalizes over the true labels (targets) + "normalize": "true", # 'true' normalizes over true labels (targets) } super().__init__(**args) - def draw(self, confusion_matrix: Tensor, subset: str, epoch: int) -> pltFigure: - """Draw Confusion Matrix as a figure, to be logged as artifact media. 
+ def draw(self, + confusion_matrix: Tensor, + stage: str, + epoch: int, + ) -> Figure: + """Draw Confusion Matrix as a figure, to be logged as artifact media + + Args: + confusion_matrix: confusion matrix values + stage: data subset {"train", "val", "test"}, for labelling + epoch: current epoch, for labelling """ - assert self.num_classes == confusion_matrix.shape[0] and self.num_classes == confusion_matrix.shape[1] + assert self.num_classes == confusion_matrix.shape[0] + assert self.num_classes == confusion_matrix.shape[1] + + # round confusion matrix values confusion_matrix = np.round(100*confusion_matrix.cpu().numpy()).astype(int) + # draw figure fig = plt.subplot() - cbar_args = {"label": "Correct predictions (%), normalized by true class"} - sn.heatmap(data = confusion_matrix, annot = True, fmt = "g", square = True, - cmap = "Blues", vmin=0, vmax=100, cbar_kws=cbar_args) - plt.title(f"Confusion Matrix [{subset}, epoch {epoch}]") + cbar_args = { + "label": "Correct predictions (%), normalized by true class"} + sn.heatmap( + data = confusion_matrix, + annot=True, fmt="g", square=True, cmap="Blues", + vmin=0, vmax=100, cbar_kws=cbar_args) + plt.title(f"Confusion Matrix [{stage}, epoch {epoch}]") plt.xlabel("Predicted class") plt.ylabel("True class") plt.close() return fig - -class Metric_Accuracy(MulticlassAccuracy): - """Accuracy metric class; inherits methods from - torchmetrics parent class. 
- """ - def __init__(self, cfg: OmegaConf): - self.num_classes = cfg.model.network.params.num_classes - args = { - "task": "binary" if self.num_classes == 2 else "multiclass", - "num_classes": self.num_classes, - } - super().__init__(**args) - + +@METRICS_REGISTRY.register_element() +def confusion_matrix(cfg) -> ConfusionMatrix: + return ConfusionMatrix(cfg) + + def metric_accuracy(logits: Tensor, target: Tensor, task: str, num_classes: int) -> Tensor: preds = torch.argmax(logits, dim=1) From 3fbfb122b926d9ac4a9c25bb738956b6c9dc8b80 Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Fri, 27 Oct 2023 12:16:36 +0100 Subject: [PATCH 02/19] empty base config --- configs/SkinLesionSegmentation.yaml | 4 +- configs/_base.yaml | 69 ++++++++++++++++++++++++++++ configs/base.yaml | 71 ----------------------------- 3 files changed, 71 insertions(+), 73 deletions(-) create mode 100755 configs/_base.yaml delete mode 100755 configs/base.yaml diff --git a/configs/SkinLesionSegmentation.yaml b/configs/SkinLesionSegmentation.yaml index f6e8a2f..493df94 100755 --- a/configs/SkinLesionSegmentation.yaml +++ b/configs/SkinLesionSegmentation.yaml @@ -1,9 +1,9 @@ +task: SemanticSegmentation + modes: train: true test: false -task: SemanticSegmentation - data: dataset: HAM10000 root: /Users/pme/research/data/HAM10000 diff --git a/configs/_base.yaml b/configs/_base.yaml new file mode 100755 index 0000000..e70f97b --- /dev/null +++ b/configs/_base.yaml @@ -0,0 +1,69 @@ +task: + +modes: + train: + test: + +data: + dataset: + root: + num_workers: + batch_size: + module: + target: + train_transforms: + test_transforms: + +model: + module: + target: + network: + target: + params: + optimizer: + target: + params: + scheduler: + target: + params: + call: + interval: + frequency: + loss: + target: + params: + +engine: + accelerator: + strategy: + devices: + num_nodes: + precision: + +metrics: + train: + val: + test: + +train: + num_epochs: + val_every_n_epoch: + 
grad_accum_from_epoch: + grad_accum_every_n_batches: + ckpt_resume_path: + ckpt_monitor_metric: + ckpt_every_n_epochs: + ckpt_save_top_k: + early_stop_metric: + early_stop_delta: + early_stop_patience: + +test: + ckpt_test_path: + +logger: + name: + project_name: + tags: + notes: + log_every_n_steps: \ No newline at end of file diff --git a/configs/base.yaml b/configs/base.yaml deleted file mode 100755 index 55f8d5b..0000000 --- a/configs/base.yaml +++ /dev/null @@ -1,71 +0,0 @@ -modes: - train: true - test: false - -task: ImageClassification - -data: - root: /Users/pme/data/ - dataset: MNIST - image_size: 28 - num_channels: 1 - num_classes: 10 - num_workers: 4 - batch_size: 256 - module: - target: deeplightning.data.dataloaders.image.mnist.MNIST - -model: - module: - target: deeplightning.task.image.classification.TaskModule - network: - #target: deeplightning.models.cnn.SymbolCNN - target: deeplightning.models.mobilenetv3.mobilenet_v3_small - params: - #num_classes: 10 - #num_channels: 1 - num_classes: 10 - num_channels: 1 - optimizer: - target: torch.optim.SGD - params: - lr: 0.01 - weight_decay: 0.01 - momentum: 0.9 - scheduler: - target: torch.optim.lr_scheduler.ExponentialLR - params: - gamma: 0.99 - call: - interval: "epoch" - frequency: 1 - loss: - target: torch.nn.CrossEntropyLoss - params: - -engine: - accelerator: cpu # {cpu,gpu} - strategy: auto # {auto, ddp, deepspeed} - devices: 1 # {1, [0,1]} - num_nodes: 1 - precision: 32 - -train: - num_epochs: 1 - val_every_n_epoch: 1 - grad_accum_from_epoch: 0 - grad_accum_every_n_batches: 1 - ckpt_resume_path: null - ckpt_monitor_metric: val_acc # used in `ModelCheckpoint` callback - ckpt_every_n_epochs: 1 - ckpt_save_top_k: 1 - early_stop_metric: null # used in `EarlyStopping` callback - early_stop_delta: 0.001 - early_stop_patience: 3 - -logger: - name: wandb - project_name: trial - tags: ["_"] # cannot be empty - notes: null - log_every_n_steps: 20 \ No newline at end of file From 
85fc6ebcb0bf2f18c38708e177dadc18d39f0e70 Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Sun, 29 Oct 2023 01:46:26 +0100 Subject: [PATCH 03/19] refactor tests --- conda_env.yaml | 13 ++++---- condaenv.sqtapdy1.requirements.txt | 21 +++++++++++++ configs/ImageReconstruction_vqvae.yaml | 10 ++----- .../dummy_config.yaml => configs/_dummy.yaml | 26 ++++++++++------ deeplightning/config/load.py | 4 +-- deeplightning/init/initializers.py | 3 +- deeplightning/registry.py | 15 +++++----- deeplightning/task/vision/classification.py | 2 +- deeplightning/trainer/trainer.py | 4 +-- tests/__init__.py | 8 +++++ tests/run_all_tests.sh | 6 ++-- tests/test_checkpoint.py | 30 +++++++++++++------ tests/test_configs.py | 20 +++++++++++++ tests/test_trainer.py | 30 +++++-------------- 14 files changed, 122 insertions(+), 70 deletions(-) create mode 100644 condaenv.sqtapdy1.requirements.txt rename tests/helpers/dummy_config.yaml => configs/_dummy.yaml (71%) create mode 100644 tests/__init__.py create mode 100644 tests/test_configs.py diff --git a/conda_env.yaml b/conda_env.yaml index 309facc..0f0debd 100755 --- a/conda_env.yaml +++ b/conda_env.yaml @@ -3,8 +3,8 @@ channels: - defaults - conda-forge dependencies: - - pip==23.0.1 - - python==3.10 # numpy==1.22 not working with python==3.9 + - pip + - python==3.10 - pip: - colorama==0.4.4 - deepspeed==0.5.10 @@ -17,13 +17,14 @@ dependencies: - matplotlib==3.5.1 - numpy==1.23.5 - omegaconf==2.1.1 - - opencv-python==4.7.0.72 #==4.1.2.30 + - opencv-python==4.7.0.72 - pandas==1.5.3 + - pytest - pyyaml==6.0 - seaborn==0.12.0 - - torch==2.0.0 #==1.13.1 + - torch==2.0.0 - torchaudio==2.0.1 - torchlibrosa==0.1.0 - - torchmetrics==0.11.4 #0.11.0 - - torchvision==0.15.1 #0.11.3 + - torchmetrics==1.2.0 + - torchvision==0.15.1 - wandb==0.12.21 \ No newline at end of file diff --git a/condaenv.sqtapdy1.requirements.txt b/condaenv.sqtapdy1.requirements.txt new file mode 100644 index 0000000..9fb1cdc --- /dev/null +++ 
b/condaenv.sqtapdy1.requirements.txt @@ -0,0 +1,21 @@ +colorama==0.4.4 +deepspeed==0.5.10 +einops==0.4.0 +flask==2.0.3 +imagesize==1.4.1 +ipython +librosa==0.9.2 +lightning==2.0.0 +matplotlib==3.5.1 +numpy==1.23.5 +omegaconf==2.1.1 +opencv-python==4.7.0.72 +pandas==1.5.3 +pyyaml==6.0 +seaborn==0.12.0 +torch==2.0.0 +torchaudio==2.0.1 +torchlibrosa==0.1.0 +torchmetrics==0.11.4 +torchvision==0.15.1 +wandb==0.12.21 \ No newline at end of file diff --git a/configs/ImageReconstruction_vqvae.yaml b/configs/ImageReconstruction_vqvae.yaml index 08f34c2..c4579bd 100755 --- a/configs/ImageReconstruction_vqvae.yaml +++ b/configs/ImageReconstruction_vqvae.yaml @@ -39,8 +39,8 @@ model: target: deeplightning.modules.loss.vqvae_loss.VQVAE_Loss params: smooth_l1_loss: True - num_tokens: ${model.network.params.num_tokens} - kl_div_loss_weight: ${model.network.params.kl_div_loss_weight} + num_tokens: 8 # ${model.network.params.num_tokens} + kl_div_loss_weight: 0.0 # ${model.network.params.kl_div_loss_weight} engine: backend: deepspeed_stage_3 @@ -64,8 +64,4 @@ log_to_wandb: true project_name: trial tags: ["image", "reconstruction", "vqvae"] # cannot be empty notes: null - type: pytorch_lightning.loggers.MLFlowLogger - params: - experiment_name: Default - tracking_uri: mlruns - log_every_n_steps: 10 \ No newline at end of file +log_every_n_steps: 20 \ No newline at end of file diff --git a/tests/helpers/dummy_config.yaml b/configs/_dummy.yaml similarity index 71% rename from tests/helpers/dummy_config.yaml rename to configs/_dummy.yaml index 3d7860b..ffdfc28 100755 --- a/tests/helpers/dummy_config.yaml +++ b/configs/_dummy.yaml @@ -36,25 +36,33 @@ model: params: engine: - backend: deepspeed_stage_3 - gpus: null + accelerator: cpu + strategy: auto + devices: auto num_nodes: 1 precision: 32 +metrics: + train: default + val: default + test: default + train: num_epochs: 1 val_every_n_epoch: 1 grad_accum_from_epoch: 0 grad_accum_every_n_batches: 1 ckpt_resume_path: null + 
ckpt_monitor_metric: null # used in `ModelCheckpoint` callback ckpt_every_n_epochs: 1 + ckpt_save_top_k: 1 early_stop_metric: null + early_stop_delta: 0.001 + early_stop_patience: 3 logger: - log_to_wandb: false - target: pytorch_lightning.loggers.MLFlowLogger - params: - experiment_name: Default - tracking_uri: mlruns - log_every_n_steps: 10 - flush_logs_every_n_steps: 50 \ No newline at end of file + name: wandb + project_name: unittests + tags: ["_"] # cannot be empty + notes: null + log_every_n_steps: 20 \ No newline at end of file diff --git a/deeplightning/config/load.py b/deeplightning/config/load.py index d64ed2f..c304826 100755 --- a/deeplightning/config/load.py +++ b/deeplightning/config/load.py @@ -21,7 +21,7 @@ def load_config(config_file: str = "configs/base.yaml") -> OmegaConf: """ cfg = OmegaConf.load(config_file) cfg = merge_defaults(cfg) - cfg = check_consistency(cfg) + #cfg = check_consistency(cfg) cfg = runtime_compute(cfg) OmegaConf.resolve(cfg) #config_print(OmegaConf.to_yaml(cfg)) @@ -103,4 +103,4 @@ def log_config(cfg: OmegaConf, path: str) -> None: if not os.path.exists(path): os.makedirs(path, exist_ok=True) - OmegaConf.save(cfg, f = os.path.join(path, "cfg.yaml")) \ No newline at end of file + OmegaConf.save(cfg, f=os.path.join(path, "cfg.yaml")) \ No newline at end of file diff --git a/deeplightning/init/initializers.py b/deeplightning/init/initializers.py index d8cbf15..dfa916c 100755 --- a/deeplightning/init/initializers.py +++ b/deeplightning/init/initializers.py @@ -7,7 +7,7 @@ from deeplightning.config.defaults import __ConfigGroups__ from deeplightning.init.imports import init_module from deeplightning.trainer.trainer import DLTrainer -from deeplightning.registry import __MetricsRegistry__ +#from deeplightning.registry import __MetricsRegistry__ @@ -53,4 +53,5 @@ def init_everything(cfg: OmegaConf) -> Tuple[LightningModule, LightningDataModul def init_metrics(cfg: OmegaConf, device: torch.device) -> dict: """ Initialize performance 
metrics """ + raise NotImplementedError return {k: v(cfg).to(device) for k, v in __MetricsRegistry__[cfg.task].items()} \ No newline at end of file diff --git a/deeplightning/registry.py b/deeplightning/registry.py index e21f972..6009bde 100755 --- a/deeplightning/registry.py +++ b/deeplightning/registry.py @@ -1,4 +1,4 @@ -#from lightning.pytorch.loggers import WandbLogger +from lightning.pytorch.loggers import WandbLogger #from deeplightning.logger.wandb import wandbLogger """ @@ -95,7 +95,7 @@ def get_element_names(self) -> List: -''' + __TaskRegistry__ = [ # Image "ImageClassification", @@ -106,6 +106,12 @@ def get_element_names(self) -> List: "AudioClassification", ] +__LoggerRegistry__ = { + "wandb": WandbLogger, +} + + +''' __HooksRegistry__ = { # Image "ImageClassification": { @@ -181,9 +187,4 @@ def get_element_names(self) -> List: }, } - - -__LoggerRegistry__ = { - "wandb": WandbLogger, -} ''' \ No newline at end of file diff --git a/deeplightning/task/vision/classification.py b/deeplightning/task/vision/classification.py index 8f5eaa3..6f73226 100755 --- a/deeplightning/task/vision/classification.py +++ b/deeplightning/task/vision/classification.py @@ -48,7 +48,7 @@ def __init__(self, cfg: OmegaConf): # Initialise metrics to track during training torch_device = torch.device("cuda") if cfg.engine.accelerator == "gpu" else torch.device('cpu') - self.metrics = init_metrics(cfg, device=torch_device) + #self.metrics = init_metrics(cfg, device=torch_device) # Initialise label to track metrics against self.step_label = "iteration" diff --git a/deeplightning/trainer/trainer.py b/deeplightning/trainer/trainer.py index 04566db..019c3ac 100755 --- a/deeplightning/trainer/trainer.py +++ b/deeplightning/trainer/trainer.py @@ -16,8 +16,6 @@ from deeplightning.logger.helpers import add_logger_params_to_config from deeplightning.logger.wandb import init_wandb_metrics from deeplightning.utils.messages import config_print -from deeplightning.registry import 
(__LoggerRegistry__, - __HooksRegistry__) from deeplightning.utils.python_utils import flatten_dict @@ -84,7 +82,7 @@ def init_logger(self, cfg: OmegaConf) -> None: # intialize step label for each metrics logger.step_label = init_wandb_metrics( - metric_names = __HooksRegistry__[cfg.task]["LOGGED_METRICS_NAMES"], + metric_names = [f"{x}_{y}" for x in cfg.metrics for y in cfg.metrics[x]], #__HooksRegistry__[cfg.task]["LOGGED_METRICS_NAMES"], step_label = "iteration", ) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..a6042fc --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,8 @@ +import os +import sys + + +PROJECT_PATH = os.getcwd() +SOURCE_PATH = os.path.join(PROJECT_PATH, "deeplightning") + +sys.path.append(SOURCE_PATH) \ No newline at end of file diff --git a/tests/run_all_tests.sh b/tests/run_all_tests.sh index 82d0be3..585bc28 100755 --- a/tests/run_all_tests.sh +++ b/tests/run_all_tests.sh @@ -2,8 +2,10 @@ # before running the script make sure to activate `deeplightning` # environment and install `pytest` library -#conda activate deeplightning -#pip install pytest +# ``` +# conda activate deeplightning +# pip install pytest +# ```` # run `pytest --help` to see argument options diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py index f355280..fa64667 100755 --- a/tests/test_checkpoint.py +++ b/tests/test_checkpoint.py @@ -3,7 +3,6 @@ sys.path.insert(0, "..") import torch import lightning as pl -from lightning.utilities import rank_zero_only import pytest from tests.helpers.tools import compare_model_params @@ -16,9 +15,9 @@ CKPT_PATH = os.path.join(TMP_DIR, "last.ckpt") -def setup_trainer(strategy, precision, gpus): - - ckpt_callback = pl.callbacks.ModelCheckpoint( +def setup_trainer(accelerator, strategy, devices, precision): + + ckpt_callback = pl.pytorch.callbacks.ModelCheckpoint( dirpath = TMP_DIR, save_last = True, every_n_epochs = 1) @@ -26,9 +25,10 @@ def setup_trainer(strategy, precision, gpus): trainer 
= pl.Trainer( max_epochs = 1, logger = False, + accelerator = accelerator, strategy = strategy, + devices = devices, precision = precision, - gpus = gpus, limit_train_batches = 2, limit_val_batches = 2, enable_model_summary = False, @@ -43,14 +43,26 @@ def setup_trainer(strategy, precision, gpus): "kwargs", ( pytest.param( - dict(strategy = None, precision = 32, gpus = None)), + dict(accelerator="cpu", strategy="auto", devices="auto", precision=16)), + pytest.param( + dict(accelerator="cpu", strategy="auto", devices="auto", precision=32)), + pytest.param( + dict(accelerator="gpu", strategy="auto", devices=[0], precision=16), + marks = pytest.mark.skipif( + condition = not torch.cuda.is_available(), + reason="gpu unavailable")), pytest.param( - dict(strategy = "ddp", precision = 32, gpus = [0]), + dict(accelerator="gpu", strategy="auto", devices=[0], precision=32), marks = pytest.mark.skipif( condition = not torch.cuda.is_available(), - reason="single-gpu unavailable")), + reason="gpu unavailable")), + pytest.param( + dict(accelerator="gpu", strategy="auto", devices=[0,1], precision=16), + marks = pytest.mark.skipif( + condition = torch.cuda.device_count() < 2, + reason="multi-gpu unavailable")), pytest.param( - dict(strategy = "ddp", precision = 32, gpus = [0,1]), + dict(accelerator="gpu", strategy="auto", devices=[0,1], precision=32), marks = pytest.mark.skipif( condition = torch.cuda.device_count() < 2, reason="multi-gpu unavailable")), diff --git a/tests/test_configs.py b/tests/test_configs.py new file mode 100644 index 0000000..b5ab7f8 --- /dev/null +++ b/tests/test_configs.py @@ -0,0 +1,20 @@ +import os +import sys +sys.path.insert(0, "..") +from omegaconf import OmegaConf +import pytest + +from deeplightning.config.load import load_config + + +def test_configs(): + + cfg_base = load_config(config_file="configs/_base.yaml") + assert OmegaConf.is_config(cfg_base) + + for cfg_filename in os.listdir("configs"): + if cfg_filename != "_base.yaml": + cfg = 
load_config(config_file = f"configs/{cfg_filename}") + assert OmegaConf.is_config(cfg) + + \ No newline at end of file diff --git a/tests/test_trainer.py b/tests/test_trainer.py index b729f31..76035ae 100755 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -20,23 +20,14 @@ "kwargs", ( pytest.param( - dict( - strategy = None, - precision = 32, - gpus = None)), + dict(accelerator="cpu", strategy="auto", devices="auto", precision=32)), pytest.param( - dict( - strategy = "ddp", - precision = 32, - gpus = [0]), + dict(accelerator="gpu", strategy="ddp", devices="auto", precision=32), marks = pytest.mark.skipif( condition = not torch.cuda.is_available(), - reason="single-gpu unavailable")), + reason="gpu unavailable")), pytest.param( - dict( - strategy = "ddp", - precision = 32, - gpus = [0,1]), + dict(accelerator="gpu", strategy="ddp", precision=32), marks = pytest.mark.skipif( condition = torch.cuda.device_count() < 2, reason="multi-gpu unavailable")), @@ -44,19 +35,12 @@ ) def test_trainer(kwargs): - cfg = load_config(config_file = "helpers/dummy_config.yaml") + cfg = load_config(config_file = "configs/_dummy.yaml") + cfg.engine.accelerator = kwargs["accelerator"] cfg.engine.strategy = kwargs["strategy"] cfg.engine.precision = kwargs["precision"] - cfg.engine.devices = kwargs["gpus"] - # TODO extra params for quick testing - ''' - cfg.test_params.limit_train_batches = 2 - cfg.test_params.limit_val_batches = 2 - cfg.test_params.enable_model_summary = False, - cfg.test_params.enable_progress_bar = False, - cfg.test_params.logger = False - ''' + cfg.engine.devices = kwargs["devices"] model = init_model(cfg) trainer = init_trainer(cfg) From 9d6cc8a1f5bba3f3adf39a386d5bf13e385f17b0 Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Sun, 29 Oct 2023 22:53:16 +0000 Subject: [PATCH 04/19] cfg refactor --- configs/AudioClassification_lstm.yaml | 73 --------------- configs/ImageClassification_cnn.yaml | 69 -------------- 
.../ImageClassification_cnn_TaskAgnostic.yaml | 67 -------------- configs/ImageClassification_resnet.yaml | 62 ------------- configs/ImageGeneration_gan.yaml | 76 ---------------- configs/ImageReconstruction_vqvae.yaml | 67 -------------- configs/ObjectRecognition_vit.yaml | 91 ------------------- configs/PedestrianDetection_yolo.yaml | 10 -- configs/SpokenWordRecognition_cnn.yaml | 79 ---------------- configs/SymbolRecognition_cnn.yaml | 67 -------------- configs/SymbolRecognition_mlpmixer.yaml | 72 --------------- configs/SymbolRecognition_vit.yaml | 74 --------------- {configs => tests/helpers}/_dummy.yaml | 0 tests/test_configs.py | 34 +++++-- tests/test_trainer.py | 2 +- 15 files changed, 28 insertions(+), 815 deletions(-) delete mode 100755 configs/AudioClassification_lstm.yaml delete mode 100755 configs/ImageClassification_cnn.yaml delete mode 100755 configs/ImageClassification_cnn_TaskAgnostic.yaml delete mode 100755 configs/ImageClassification_resnet.yaml delete mode 100755 configs/ImageGeneration_gan.yaml delete mode 100755 configs/ImageReconstruction_vqvae.yaml delete mode 100755 configs/ObjectRecognition_vit.yaml delete mode 100755 configs/PedestrianDetection_yolo.yaml delete mode 100755 configs/SpokenWordRecognition_cnn.yaml delete mode 100755 configs/SymbolRecognition_cnn.yaml delete mode 100755 configs/SymbolRecognition_mlpmixer.yaml delete mode 100755 configs/SymbolRecognition_vit.yaml rename {configs => tests/helpers}/_dummy.yaml (100%) diff --git a/configs/AudioClassification_lstm.yaml b/configs/AudioClassification_lstm.yaml deleted file mode 100755 index 70c0c2b..0000000 --- a/configs/AudioClassification_lstm.yaml +++ /dev/null @@ -1,73 +0,0 @@ -modes: - train: true - test: false - -task: ImageClassification -data: - dataset: FSD - root: /Users/pme/data/fsd - num_workers: 4 - batch_size: 256 - module: - target: deeplightning.data.dataloaders.audio.fsd.FreeSpokenDigit - train_transforms: - normalize: # use 
`deeplightning.utils.data.compute_dataset_mean_and_stdev()` - mean: [0.4711] - stdev: [0.1464] - test_transforms: - normalize: - mean: [0.4711] - stdev: [0.1464] - -model: - module: - target: deeplightning.task.audio.audio_classif.AudioClassification - network: - target: deeplightning.models.lstm.LSTM - params: - num_classes: ..... - num_channels: ..... - optimizer: - target: torch.optim.Adadelta - params: - scheduler: - target: torch.optim.lr_scheduler.ExponentialLR - params: - gamma: 0.99 - call: - interval: "epoch" - frequency: 1 - loss: - target: torch.nn.CrossEntropyLoss - params: - -engine: - backend: ddp - gpus: [0] - num_nodes: 1 - precision: 32 - -train: - num_epochs: 5 - val_every_n_epoch: 1 - grad_accum_from_epoch: 0 - grad_accum_every_n_batches: 1 - ckpt_resume_path: null - ckpt_every_n_epochs: 1 - early_stop_metric: null - early_stop_delta: 0.001 - early_stop_patience: 3 - -test: - ckpt_test_path: /PATH_TO_CKPT # used only when `modes.test=True` - -logger: - log_to_wandb: true - project_name: AudioClassification_FSD_LSTM - tags: ["audio", "classification", "lstm"] # cannot be empty - notes: null - type: pytorch_lightning.loggers.MLFlowLogger - params: - experiment_name: Default - tracking_uri: mlruns - log_every_n_steps: 10 \ No newline at end of file diff --git a/configs/ImageClassification_cnn.yaml b/configs/ImageClassification_cnn.yaml deleted file mode 100755 index 072105e..0000000 --- a/configs/ImageClassification_cnn.yaml +++ /dev/null @@ -1,69 +0,0 @@ -modes: - train: true - test: true - -task: ImageClassification -data: - dataset: MNIST - root: ~/data/MNIST - num_workers: 4 - batch_size: 256 - module: - target: deeplightning.data.dataloaders.image.mnist.MNIST - image_size: 28 - num_channels: 1 - num_classes: 10 - -model: - module: - target: deeplightning.task.image.classification.ImageClassification - network: - target: deeplightning.models.cnn.SymbolCNN - params: - num_classes: 10 - num_channels: 1 - optimizer: - target: torch.optim.Adadelta - 
params: - scheduler: - target: torch.optim.lr_scheduler.ExponentialLR - params: - gamma: 0.99 - call: - interval: "epoch" - frequency: 1 - loss: - target: torch.nn.CrossEntropyLoss - params: - -engine: - backend: ddp - gpus: [0] - num_nodes: 1 - precision: 32 - -train: - num_epochs: 2 - val_every_n_epoch: 1 - grad_accum_from_epoch: 0 - grad_accum_every_n_batches: 1 - ckpt_resume_path: null - ckpt_every_n_epochs: 1 - early_stop_metric: null - early_stop_delta: 0.001 - early_stop_patience: 3 - -test: - ckpt_test_path: /PATH_TO_CKPT # used only when `modes.test=True` - -logger: - name: wandb - log_to_wandb: true - project_name: trial - tags: ["image", "classification", "cnn"] # cannot be empty - notes: null - type: pytorch_lightning.loggers.MLFlowLogger - params: - experiment_name: Default - tracking_uri: mlruns - log_every_n_steps: 10 \ No newline at end of file diff --git a/configs/ImageClassification_cnn_TaskAgnostic.yaml b/configs/ImageClassification_cnn_TaskAgnostic.yaml deleted file mode 100755 index c7e365e..0000000 --- a/configs/ImageClassification_cnn_TaskAgnostic.yaml +++ /dev/null @@ -1,67 +0,0 @@ -modes: - train: true - test: true - -task: ImageClassification -data: - dataset: MNIST - root: ~/data/MNIST - num_workers: 4 - batch_size: 256 - module: - target: deeplightning.data.dataloaders.image.mnist.MNIST - image_size: 28 - num_channels: 1 - num_classes: 10 - -model: - module: - target: deeplightning.task.image.classification.TaskModule - network: - target: deeplightning.models.cnn.SymbolCNN - params: - num_classes: 10 - num_channels: 1 - optimizer: - target: torch.optim.Adadelta - params: - scheduler: - target: torch.optim.lr_scheduler.ExponentialLR - params: - gamma: 0.99 - call: - interval: "epoch" - frequency: 1 - loss: - target: torch.nn.CrossEntropyLoss - params: - -engine: - accelerator: cpu - strategy: ddp - devices: 1 - num_nodes: 1 - precision: 32 - -train: - num_epochs: 1 - val_every_n_epoch: 1 - grad_accum_from_epoch: 0 - 
grad_accum_every_n_batches: 32 - ckpt_resume_path: null - ckpt_monitor_metric: val_acc # used in `ModelCheckpoint` callback - ckpt_every_n_epochs: 1 - ckpt_save_top_k: 1 - early_stop_metric: null - early_stop_delta: 0.001 - early_stop_patience: 3 - -test: - ckpt_test_path: /PATH_TO_CKPT # used only when `modes.test=True` - -logger: - name: wandb - project_name: trial - tags: ["image", "classification", "cnn"] # cannot be empty - notes: null - log_every_n_steps: 20 \ No newline at end of file diff --git a/configs/ImageClassification_resnet.yaml b/configs/ImageClassification_resnet.yaml deleted file mode 100755 index 6dddc0e..0000000 --- a/configs/ImageClassification_resnet.yaml +++ /dev/null @@ -1,62 +0,0 @@ -modes: - train: true - test: true - -task: classification - -data: - root: /Users/pme12/data/ - num_workers: 4 - batch_size: 256 - module: - target: deeplightning.data.dataloaders.mnist.MNIST - -model: - module: - target: deeplightning.task.image.classification.ImageClassification - network: - target: deeplightning.models.resnet.resnet18 - params: - num_classes: 10 - num_channels: 1 - optimizer: - target: torch.optim.Adadelta - params: - scheduler: - target: torch.optim.lr_scheduler.ExponentialLR - params: - gamma: 0.99 - call: - interval: "epoch" - frequency: 1 - loss: - target: torch.nn.CrossEntropyLoss - params: - -engine: - backend: deepspeed_stage_3 - gpus: null - num_nodes: 1 - precision: 32 - -train: - num_epochs: 2 - val_every_n_epoch: 1 - grad_accum_from_epoch: 0 - grad_accum_every_n_batches: 1 - ckpt_resume_path: null - ckpt_every_n_epochs: 1 - early_stop_metric: null - early_stop_delta: 0.001 - early_stop_patience: 3 - -logger: - log_to_wandb: true - project_name: trial - tags: ["image", "classification", "resnet"] # cannot be empty - notes: null - type: pytorch_lightning.loggers.MLFlowLogger - params: - experiment_name: Default - tracking_uri: mlruns - log_every_n_steps: 10 \ No newline at end of file diff --git a/configs/ImageGeneration_gan.yaml 
b/configs/ImageGeneration_gan.yaml deleted file mode 100755 index 036b0a1..0000000 --- a/configs/ImageGeneration_gan.yaml +++ /dev/null @@ -1,76 +0,0 @@ -task: reconstruction -data: - root: /Users/pme12/data/ - dataset: MNIST - resize: 32 # VQVAE requires image size to be power of 2 - num_workers: 4 - batch_size: 128 - module: - target: deeplightning.data.dataloaders.mnist.MNIST - -model: - module: - target: deeplightning.task.image.reconstructionVAE.ImageReconstructionGAN - network: - target: deeplightning.models.dcgan.DCGAN - params: - image_size: 28 - channels: 1 - latent_dim: 100 - optimizer: - discriminator: - target: torch.optim.Adam - params: - lr: 0.001 - steps: 1 - generator: - target: torch.optim.Adam - params: - lr: 0.001 - steps: 1 - scheduler: - discriminator: - target: torch.optim.lr_scheduler.ExponentialLR - params: - gamma: 0.99 - call: - interval: "epoch" - frequency: 1 - generator: - target: torch.optim.lr_scheduler.ExponentialLR - params: - gamma: 0.99 - call: - interval: "epoch" - frequency: 1 - loss: - target: torch.nn.BCELoss # BCEWithLogitsLoss - params: - -engine: - backend: deepspeed_stage_3 - gpus: null - num_nodes: 1 - precision: 32 - -train: - num_epochs: 10 - val_every_n_epoch: 1 - grad_accum_from_epoch: 0 - grad_accum_every_n_batches: 1 - ckpt_resume_path: null - ckpt_every_n_epochs: 1 - early_stop_metric: null - early_stop_delta: 0.001 - early_stop_patience: 3 - -logger: - log_to_wandb: true - project_name: trial - tags: ["image", "generation", "gan"] # cannot be empty - notes: null - type: pytorch_lightning.loggers.MLFlowLogger - params: - experiment_name: Default - tracking_uri: mlruns - log_every_n_steps: 10 \ No newline at end of file diff --git a/configs/ImageReconstruction_vqvae.yaml b/configs/ImageReconstruction_vqvae.yaml deleted file mode 100755 index c4579bd..0000000 --- a/configs/ImageReconstruction_vqvae.yaml +++ /dev/null @@ -1,67 +0,0 @@ -task: reconstruction -data: - root: /Users/pme12/data/ - dataset: MNIST - resize: 
32 # VQVAE requires image size to be power of 2 - num_workers: 4 - batch_size: 128 - module: - target: deeplightning.data.dataloaders.mnist.MNIST - -model: - module: - target: deeplightning.task.image.reconstructionVAE.ImageReconstructionVAE - network: - target: deeplightning.models.vqvae.DiscreteVAE - params: - image_size: 32 - num_tokens: 8 #512 - codebook_dim: 32 #256 - num_layers: 1 #2 - num_resnet_blocks: 1 #2 - hidden_dim: 32 #128 - channels: 1 - temperature: 0.9 - straight_through: False - kl_div_loss_weight: 0.0 - normalization: null - optimizer: - target: torch.optim.Adadelta - params: - scheduler: - target: torch.optim.lr_scheduler.ExponentialLR - params: - gamma: 0.99 - call: - interval: "epoch" - frequency: 1 - loss: - target: deeplightning.modules.loss.vqvae_loss.VQVAE_Loss - params: - smooth_l1_loss: True - num_tokens: 8 # ${model.network.params.num_tokens} - kl_div_loss_weight: 0.0 # ${model.network.params.kl_div_loss_weight} - -engine: - backend: deepspeed_stage_3 - gpus: null - num_nodes: 1 - precision: 32 - -train: - num_epochs: 10 - val_every_n_epoch: 1 - grad_accum_from_epoch: 0 - grad_accum_every_n_batches: 1 - ckpt_resume_path: null - ckpt_every_n_epochs: 1 - early_stop_metric: null - early_stop_delta: 0.001 - early_stop_patience: 3 - -logger: -log_to_wandb: true -project_name: trial -tags: ["image", "reconstruction", "vqvae"] # cannot be empty -notes: null -log_every_n_steps: 20 \ No newline at end of file diff --git a/configs/ObjectRecognition_vit.yaml b/configs/ObjectRecognition_vit.yaml deleted file mode 100755 index 361ba6a..0000000 --- a/configs/ObjectRecognition_vit.yaml +++ /dev/null @@ -1,91 +0,0 @@ -modes: - train: true - test: true - -task: classification - -data: - root: /Users/pme/data/CIFAR10 - num_workers: 4 - batch_size: 256 - module: - target: deeplightning.data.dataloaders.image.cifar10.CIFAR10 - # transformations/augmentations - train_transforms: - crop: - size: 32 - pad: 4 - hflip: 0.5 - normalize: - mean: [0.4914, 0.4822, 
0.4465] - std: [0.2023, 0.1994, 0.2010] - test_transforms: - normalize: - mean: [0.4914, 0.4822, 0.4465] - std: [0.2023, 0.1994, 0.2010] - # the following may be required inputs for model/lr_scheduler/etc - image_size: [32, 32] - num_channels: 3 - num_classes: 10 - num_training_samples: 45000 - -model: - module: - target: deeplightning.task.image.classification.ImageClassification - network: - target: deeplightning.models.vit.VisionTransformer - params: - image_size: "${data.image_size}" - num_classes: "${data.num_classes}" - num_channels: "${data.num_channels}" - embed_dim: 128 - mlp_dim: 512 - num_heads: 8 - num_layers: 6 - patch_size: 4 - dropout: 0.2 - optimizer: - target: torch.optim.Adam - params: - lr: 0.001 - scheduler: - #target: torch.optim.lr_scheduler.ExponentialLR - target: torch.optim.lr_scheduler.CosineAnnealingLR - params: - #gamma: 0.99 - eta_min: 0.0001 - T_max: "AUTO" # will be computed at runtime - call: - interval: "step" - frequency: 1 - loss: - target: torch.nn.CrossEntropyLoss - params: - -engine: - backend: deepspeed_stage_3 - gpus: null - num_nodes: 1 - precision: 32 - -train: - num_epochs: 10 - val_every_n_epoch: 1 - grad_accum_from_epoch: 0 - grad_accum_every_n_batches: 1 - ckpt_resume_path: null - ckpt_every_n_epochs: 1 - early_stop_metric: null - early_stop_delta: 0.001 - early_stop_patience: 3 - -logger: - log_to_wandb: true - project_name: ObjectRecognition - tags: ["image", "classification", "vit"] # cannot be empty - notes: null - log_every_n_steps: 10 - type: pytorch_lightning.loggers.MLFlowLogger - params: - experiment_name: Default - tracking_uri: mlruns \ No newline at end of file diff --git a/configs/PedestrianDetection_yolo.yaml b/configs/PedestrianDetection_yolo.yaml deleted file mode 100755 index e1ae6fd..0000000 --- a/configs/PedestrianDetection_yolo.yaml +++ /dev/null @@ -1,10 +0,0 @@ -task: detection -model_cfg: "external/yolov5/models/yolov5x.yaml" -model_ckpt: "/Users/pme/code/yolov5/yolov5x6.pt" -input_path: 
"/Users/pme/Downloads/people.jpeg" -save_path: "/Users/pme/Downloads/tests/" - -engine: - backend: null - gpus: null - num_nodes: 1 \ No newline at end of file diff --git a/configs/SpokenWordRecognition_cnn.yaml b/configs/SpokenWordRecognition_cnn.yaml deleted file mode 100755 index fb64364..0000000 --- a/configs/SpokenWordRecognition_cnn.yaml +++ /dev/null @@ -1,79 +0,0 @@ -modes: - train: true - test: true - inference: false - -task: classification - -data: - root: /Users/pme/data/FSD - num_workers: 4 - batch_size: 256 - module: - target: deeplightning.data.dataloaders.audio.fsd.FreeSpokenDigit - train_transforms: - normalize: # use `deeplightning.utils.data.compute_dataset_mean_and_stdev()` - mean: [0.4711] - std: [0.1464] - test_transforms: - normalize: - mean: [0.4711] - std: [0.1464] - # the following are required as model inputs in some cases - image_size: 64 - num_channels: 1 - num_classes: 10 - -model: - module: - target: deeplightning.task.image.classification.ImageClassification - network: - target: deeplightning.models.cnn.SpectrogramCNN - params: - num_classes: 10 - num_channels: 1 - optimizer: - target: torch.optim.Adam - params: - lr: 0.001 - scheduler: - target: torch.optim.lr_scheduler.ExponentialLR - params: - gamma: 0.99 - call: - interval: "epoch" - frequency: 1 - loss: - target: torch.nn.CrossEntropyLoss - params: - -engine: - backend: ddp - gpus: [0] - num_nodes: 1 - precision: 32 - -train: - num_epochs: 20 - val_every_n_epoch: 1 - grad_accum_from_epoch: 0 - grad_accum_every_n_batches: 1 - ckpt_resume_path: null - ckpt_every_n_epochs: 1 - early_stop_metric: null - early_stop_delta: 0.001 - early_stop_patience: 3 - -test: - ckpt_test_path: /PATH_TO_CKPT # used only when `modes.test=True` and `modes.train=False` - -logger: - log_to_wandb: true - project_name: SpokenWordRecognition - tags: ["audio", "classification", "cnn"] # cannot be empty - notes: null - log_every_n_steps: 10 - type: pytorch_lightning.loggers.MLFlowLogger - params: - 
experiment_name: Default - tracking_uri: mlruns \ No newline at end of file diff --git a/configs/SymbolRecognition_cnn.yaml b/configs/SymbolRecognition_cnn.yaml deleted file mode 100755 index dcc09b4..0000000 --- a/configs/SymbolRecognition_cnn.yaml +++ /dev/null @@ -1,67 +0,0 @@ -modes: - train: true - test: true - -task: classification - -data: - root: /Users/pme/data/MNIST - num_workers: 4 - batch_size: 256 - module: - target: deeplightning.data.dataloaders.image.mnist.MNIST - # the following are required as model inputs in some cases - image_size: 28 - num_channels: 1 - num_classes: 10 - -model: - module: - target: deeplightning.task.image.classification.ImageClassification - network: - target: deeplightning.models.cnn.SymbolCNN - params: - num_channels: ${data.num_channels} - num_classes: ${data.num_classes} - optimizer: - target: torch.optim.Adam - params: - lr: 0.001 - scheduler: - target: torch.optim.lr_scheduler.ExponentialLR - params: - gamma: 0.99 - call: - interval: "epoch" - frequency: 1 - loss: - target: torch.nn.CrossEntropyLoss - params: - -engine: - backend: deepspeed_stage_3 - gpus: null - num_nodes: 1 - precision: 32 - -train: - num_epochs: 3 - val_every_n_epoch: 1 - grad_accum_from_epoch: 0 - grad_accum_every_n_batches: 1 - ckpt_resume_path: null - ckpt_every_n_epochs: 1 - early_stop_metric: null - early_stop_delta: 0.001 - early_stop_patience: 3 - -logger: - log_to_wandb: true - project_name: SymbolRecognition - tags: ["image", "classification", mlpmixer] # cannot be empty - notes: null - log_every_n_steps: 10 - type: pytorch_lightning.loggers.MLFlowLogger - params: - experiment_name: Default - tracking_uri: mlruns \ No newline at end of file diff --git a/configs/SymbolRecognition_mlpmixer.yaml b/configs/SymbolRecognition_mlpmixer.yaml deleted file mode 100755 index 1b941e2..0000000 --- a/configs/SymbolRecognition_mlpmixer.yaml +++ /dev/null @@ -1,72 +0,0 @@ -modes: - train: true - test: true - -task: classification - -data: - root: 
/Users/pme/data/MNIST - num_workers: 4 - batch_size: 256 - module: - target: deeplightning.data.dataloaders.image.mnist.MNIST - # the following are required as model inputs in some cases - image_size: 28 - num_channels: 1 - num_classes: 10 - -model: - module: - target: deeplightning.task.image.classification.ImageClassification - network: - target: deeplightning.models.mlpmixer.MLPMixer - params: - image_size: ${data.image_size} - num_channels: ${data.num_channels} - num_classes: ${data.num_classes} - patch_size: 4 - dim: 512 - depth: 6 #12 - dropout: 0.2 - optimizer: - target: torch.optim.Adam - params: - lr: 0.001 - scheduler: - target: torch.optim.lr_scheduler.ExponentialLR - params: - gamma: 0.99 - call: - interval: "epoch" - frequency: 1 - loss: - target: torch.nn.CrossEntropyLoss - params: - -engine: - backend: deepspeed_stage_3 - gpus: null - num_nodes: 1 - precision: 32 - -train: - num_epochs: 3 - val_every_n_epoch: 1 - grad_accum_from_epoch: 0 - grad_accum_every_n_batches: 1 - ckpt_resume_path: null - ckpt_every_n_epochs: 1 - early_stop_metric: null - early_stop_delta: 0.001 - early_stop_patience: 3 - -logger: - log_to_wandb: true - project_name: SymbolRecognition - tags: ["image", "classification", mlpmixer] # cannot be empty - notes: null - log_every_n_steps: 10 - type: pytorch_lightning.loggers.MLFlowLogger - params: - experiment_name: Default - tracking_uri: mlruns \ No newline at end of file diff --git a/configs/SymbolRecognition_vit.yaml b/configs/SymbolRecognition_vit.yaml deleted file mode 100755 index 94bcbe5..0000000 --- a/configs/SymbolRecognition_vit.yaml +++ /dev/null @@ -1,74 +0,0 @@ -modes: - train: true - test: true - -task: classification - -data: - root: /Users/pme/data/MNIST - num_workers: 4 - batch_size: 256 - module: - target: deeplightning.data.dataloaders.image.mnist.MNIST - # the following are required as model inputs in some cases - image_size: 28 - num_channels: 1 - num_classes: 10 - -model: - module: - target: 
deeplightning.task.image.classification.ImageClassification - network: - target: deeplightning.models.vit.VisionTransformer - params: - image_size: ${data.image_size} - num_classes: ${data.num_classes} - num_channels: ${data.num_channels} - hidden_dim: 512 - embed_dim: 128 - num_heads: 8 - num_layers: 6 - patch_size: 4 - dropout: 0.2 - optimizer: - target: torch.optim.Adam - params: - lr: 0.001 - scheduler: - target: torch.optim.lr_scheduler.ExponentialLR - params: - gamma: 0.99 - call: - interval: "step" - frequency: 1 - loss: - target: torch.nn.CrossEntropyLoss - params: - -engine: - backend: deepspeed_stage_3 - gpus: null - num_nodes: 1 - precision: 32 - -train: - num_epochs: 3 - val_every_n_epoch: 1 - grad_accum_from_epoch: 0 - grad_accum_every_n_batches: 1 - ckpt_resume_path: null - ckpt_every_n_epochs: 1 - early_stop_metric: null - early_stop_delta: 0.001 - early_stop_patience: 3 - -logger: - log_to_wandb: true - project_name: SymbolRecognition - tags: ["image", "classification", "vit"] # cannot be empty - notes: null - log_every_n_steps: 10 - type: pytorch_lightning.loggers.MLFlowLogger - params: - experiment_name: Default - tracking_uri: mlruns \ No newline at end of file diff --git a/configs/_dummy.yaml b/tests/helpers/_dummy.yaml similarity index 100% rename from configs/_dummy.yaml rename to tests/helpers/_dummy.yaml diff --git a/tests/test_configs.py b/tests/test_configs.py index b5ab7f8..07ad810 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -1,20 +1,40 @@ import os -import sys -sys.path.insert(0, "..") +from glob import glob from omegaconf import OmegaConf -import pytest +from omegaconf.listconfig import ListConfig +from omegaconf.dictconfig import DictConfig from deeplightning.config.load import load_config +def check_all_keys_exist(cfg_base, cfg): + """Check if all keys in `cfg_base` exist in `cfg` + """ + if not isinstance(cfg_base, DictConfig): + return False + if not isinstance(cfg, DictConfig): + return False + for key in 
cfg_base.keys(): + if isinstance(cfg_base[key], DictConfig): + check_all_keys_exist(cfg_base[key], cfg[key]) + else: + if key not in cfg: + print(f"Key '{key}' not in config") + return False + return True + + def test_configs(): cfg_base = load_config(config_file="configs/_base.yaml") assert OmegaConf.is_config(cfg_base) - for cfg_filename in os.listdir("configs"): - if cfg_filename != "_base.yaml": - cfg = load_config(config_file = f"configs/{cfg_filename}") - assert OmegaConf.is_config(cfg) + configs = glob(os.path.join("configs", "*")) + glob(os.path.join("tests/helpers", "*")) + configs = [x for x in configs if x.endswith(".yaml") and not x.endswith("_base.yaml")] + + for cfg_fp in configs: + cfg = load_config(config_file=cfg_fp) + assert OmegaConf.is_config(cfg), f"Not a config '{cfg_fp}'" + assert check_all_keys_exist(cfg_base, cfg), f"Incorrect structure found in '{cfg_fp}'" \ No newline at end of file diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 76035ae..7e529c4 100755 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -35,7 +35,7 @@ ) def test_trainer(kwargs): - cfg = load_config(config_file = "configs/_dummy.yaml") + cfg = load_config(config_file = "tests/helpers/_dummy.yaml") cfg.engine.accelerator = kwargs["accelerator"] cfg.engine.strategy = kwargs["strategy"] From da14e62bf396d0013f1fcdba3c304c1051687427 Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Sun, 29 Oct 2023 22:56:38 +0000 Subject: [PATCH 05/19] fix cfg structure --- tests/helpers/_dummy.yaml | 8 ++++++++ tests/test_configs.py | 5 ++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/helpers/_dummy.yaml b/tests/helpers/_dummy.yaml index ffdfc28..39163bd 100755 --- a/tests/helpers/_dummy.yaml +++ b/tests/helpers/_dummy.yaml @@ -1,4 +1,9 @@ task: classification + +modes: + train: + test: + data: root: /Users/pme12/data/ dataset: MNIST @@ -60,6 +65,9 @@ train: early_stop_delta: 0.001 early_stop_patience: 3 
+test: + ckpt_test_path: + logger: name: wandb project_name: unittests diff --git a/tests/test_configs.py b/tests/test_configs.py index 07ad810..719bf38 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -19,7 +19,6 @@ def check_all_keys_exist(cfg_base, cfg): check_all_keys_exist(cfg_base[key], cfg[key]) else: if key not in cfg: - print(f"Key '{key}' not in config") return False return True @@ -34,7 +33,7 @@ def test_configs(): for cfg_fp in configs: cfg = load_config(config_file=cfg_fp) - assert OmegaConf.is_config(cfg), f"Not a config '{cfg_fp}'" - assert check_all_keys_exist(cfg_base, cfg), f"Incorrect structure found in '{cfg_fp}'" + assert OmegaConf.is_config(cfg) + assert check_all_keys_exist(cfg_base, cfg) \ No newline at end of file From 0fceefaec46fe1edd4ac3d365d3e57a264331ee2 Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Thu, 2 Nov 2023 22:17:13 +0000 Subject: [PATCH 06/19] add param count print to base task --- .gitignore | 2 + deeplightning/task/base.py | 112 +++++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 deeplightning/task/base.py diff --git a/.gitignore b/.gitignore index c400dee..d77cacf 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,8 @@ tests/mlruns/ tests/logs/ tests/wandb/ tests/lightning_logs/ +# hydra +outputs/ # model weights *.pt diff --git a/deeplightning/task/base.py b/deeplightning/task/base.py new file mode 100644 index 0000000..50e41fc --- /dev/null +++ b/deeplightning/task/base.py @@ -0,0 +1,112 @@ +from typing import Tuple +from omegaconf import OmegaConf +import torch +from torch import Tensor +import lightning as pl + +from deeplightning.init.imports import init_obj_from_config +from deeplightning.init.initializers import init_metrics +from deeplightning.trainer.gather import gather_on_step, gather_on_epoch +from deeplightning.utils.messages import info_message + + + +class BaseTask(pl.LightningModule): + """Base task module + + 
LOGGING: manual logging `self.logger.log()` is used. This is more + flexible as Lightning automatic logging `self.log()`) only allows + scalars, not histograms, images, etc./ Additionally, auto-logging + doesn't log at step 0, which is useful. + + HOOKS: For *training*, the input to `training_epoch_end()` is the + set of outputs from `training_step()`. For *validation*, the input + to `validation_epoch_end()` is the output from `validation_step_end()` + and the input to `validation_step_end()` is the output from + `validation_step()`. + See https://github.com/PyTorchLightning/pytorch-lightning/issues/9811 + + Args + cfg: yaml configuration object + + """ + + def __init__(self, cfg: OmegaConf): + super().__init__() + self.cfg = cfg #TODO check if this contains logger runtime params + + # Lightning performs a partial validation epoch to ensure that + # everything is correct. Use this to avoid logging during that + self.sanity_check = True + + # Initialise metrics to track during training + self.device = torch.device("cuda") if cfg.engine.accelerator == "gpu" else torch.device('cpu') + self.metrics = init_metrics(cfg, device=self.device) + + # Initialise label to track metrics against + self.step_label = "iteration" + + # Aggregation utilities + self.gather_on_step = gather_on_step + self.gather_on_epoch = gather_on_epoch + + + def num_parameters(self): + """Prints the number of model parameters + + Lightning's model summary does not give the correct number + of trainable parameters. 
See + https://github.com/PyTorchLightning/pytorch-lightning/issues/12130 + """ + + trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad) + nontrainable_params = sum(p.numel() for p in self.model.parameters() if not p.requires_grad) + total_params = trainable_params + nontrainable_params + + info_message("Trainable model parameters: {:,d}".format(trainable_params)) + info_message("Non-trainable model parameters: {:,d}".format(nontrainable_params)) + info_message("Total model parameters: {:,d}".format(total_params)) + + + def forward(self, x: Tensor) -> Tensor: + raise NotImplementedError + + + def configure_optimizers(self) -> Tuple[dict]: + raise NotImplementedError + + + def training_step(self, batch, batch_idx): + raise NotImplementedError + + + def training_step_end(self): + raise NotImplementedError + + + def on_training_epoch_end(self): + raise NotImplementedError + + + def validation_step(self, batch, batch_idx): + raise NotImplementedError + + + def validation_step_end(self): + raise NotImplementedError + + + def on_validation_epoch_end(self): + raise NotImplementedError + + + def test_step(self, batch, batch_idx): + raise NotImplementedError + + + def test_step_end(self): + raise NotImplementedError + + + def on_test_epoch_end(self): + raise NotImplementedError \ No newline at end of file From fecf019d4c8fe7cbed1143d34cf79ebd54461f17 Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Thu, 2 Nov 2023 23:57:54 +0000 Subject: [PATCH 07/19] make num model params attribute read-only --- deeplightning/task/base.py | 54 +++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/deeplightning/task/base.py b/deeplightning/task/base.py index 50e41fc..c343514 100644 --- a/deeplightning/task/base.py +++ b/deeplightning/task/base.py @@ -39,10 +39,6 @@ def __init__(self, cfg: OmegaConf): # everything is correct. 
Use this to avoid logging during that self.sanity_check = True - # Initialise metrics to track during training - self.device = torch.device("cuda") if cfg.engine.accelerator == "gpu" else torch.device('cpu') - self.metrics = init_metrics(cfg, device=self.device) - # Initialise label to track metrics against self.step_label = "iteration" @@ -50,24 +46,46 @@ def __init__(self, cfg: OmegaConf): self.gather_on_step = gather_on_step self.gather_on_epoch = gather_on_epoch - - def num_parameters(self): - """Prints the number of model parameters - - Lightning's model summary does not give the correct number - of trainable parameters. See - https://github.com/PyTorchLightning/pytorch-lightning/issues/12130 + + def on_task_init_end(self): + """Attributes to initialise at the end of the `__init__` method + of the class that inherits from this `BaseTask` class. """ + self.set_num_model_params() + self.print_num_model_params() + + + @property + def num_trainable_params(self): + return self._num_trainable_params - trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad) - nontrainable_params = sum(p.numel() for p in self.model.parameters() if not p.requires_grad) - total_params = trainable_params + nontrainable_params - - info_message("Trainable model parameters: {:,d}".format(trainable_params)) - info_message("Non-trainable model parameters: {:,d}".format(nontrainable_params)) - info_message("Total model parameters: {:,d}".format(total_params)) + + @property + def num_nontrainable_params(self): + return self._num_nontrainable_params + + @property + def num_total_params(self): + return self._num_total_params + + + def set_num_model_params(self): + self._num_trainable_params = sum( + p.numel() for p in self.model.parameters() if p.requires_grad + ) + self._num_nontrainable_params = sum( + p.numel() for p in self.model.parameters() if not p.requires_grad + ) + self._num_total_params = self._num_trainable_params + self._num_nontrainable_params + + def 
print_num_model_params(self): + info_message("Trainable model parameters: {:,d}".format(self.num_trainable_params)) + info_message("Non-trainable model parameters: {:,d}".format(self.num_nontrainable_params)) + info_message("Total model parameters: {:,d}".format(self.num_total_params)) + + def forward(self, x: Tensor) -> Tensor: raise NotImplementedError From a70a5ce73898ada1cb880273ec6751a372c53520 Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Thu, 2 Nov 2023 23:59:20 +0000 Subject: [PATCH 08/19] remove temp file --- condaenv.sqtapdy1.requirements.txt | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 condaenv.sqtapdy1.requirements.txt diff --git a/condaenv.sqtapdy1.requirements.txt b/condaenv.sqtapdy1.requirements.txt deleted file mode 100644 index 9fb1cdc..0000000 --- a/condaenv.sqtapdy1.requirements.txt +++ /dev/null @@ -1,21 +0,0 @@ -colorama==0.4.4 -deepspeed==0.5.10 -einops==0.4.0 -flask==2.0.3 -imagesize==1.4.1 -ipython -librosa==0.9.2 -lightning==2.0.0 -matplotlib==3.5.1 -numpy==1.23.5 -omegaconf==2.1.1 -opencv-python==4.7.0.72 -pandas==1.5.3 -pyyaml==6.0 -seaborn==0.12.0 -torch==2.0.0 -torchaudio==2.0.1 -torchlibrosa==0.1.0 -torchmetrics==0.11.4 -torchvision==0.15.1 -wandb==0.12.21 \ No newline at end of file From 05e16850798e9e17434a0c0480761861c2b96388 Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Fri, 3 Nov 2023 00:23:20 +0000 Subject: [PATCH 09/19] move registries initialisations to __ini__.py --- deeplightning/__init__.py | 7 +++++++ deeplightning/model/cnn.py | 4 +--- deeplightning/registry.py | 8 -------- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/deeplightning/__init__.py b/deeplightning/__init__.py index e69de29..466ac42 100755 --- a/deeplightning/__init__.py +++ b/deeplightning/__init__.py @@ -0,0 +1,7 @@ +from deeplightning.registry import Registry + + +TASK_REGISTRY = Registry("tasks") +MODEL_REGISTRY = Registry("models") 
+DATA_REGISTRY = Registry("datasets") +METRIC_REGISTRY = Registry("metrics") \ No newline at end of file diff --git a/deeplightning/model/cnn.py b/deeplightning/model/cnn.py index aba6572..434b797 100755 --- a/deeplightning/model/cnn.py +++ b/deeplightning/model/cnn.py @@ -12,8 +12,8 @@ "spectrogram_cnn", ] + class SymbolCNN(nn.Module): - def __init__(self, num_classes: int, num_channels: int): super().__init__() self.num_classes = num_classes @@ -39,7 +39,6 @@ def forward(self, x): class SpectrogramCNN(nn.Module): - def __init__(self, num_classes: int, num_channels: int): super().__init__() self.num_classes = num_classes @@ -65,7 +64,6 @@ def __init__(self, num_classes: int, num_channels: int): nn.ReLU(), nn.MaxPool2d(2)) self.fc = nn.Linear(64 * 5 * 5, num_classes) - #self.dropout = nn.Dropout(p=0.1) def forward(self, x): x = self.conv1(x) diff --git a/deeplightning/registry.py b/deeplightning/registry.py index 6009bde..e2c031f 100755 --- a/deeplightning/registry.py +++ b/deeplightning/registry.py @@ -88,14 +88,6 @@ def get_element_names(self) -> List: return sorted(list(self.elements_dict.keys())) -TASK_REGISTRY = Registry("tasks") -MODEL_REGISTRY = Registry("models") -DATA_REGISTRY = Registry("datasets") -METRICS_REGISTRY = Registry("metrics") - - - - __TaskRegistry__ = [ # Image "ImageClassification", From 1804f9dfd9194fe25e534c077bb8edf5f513f0e2 Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Fri, 3 Nov 2023 23:28:07 +0000 Subject: [PATCH 10/19] clean metrics --- README.md | 127 +----------------- configs/SkinLesionSegmentation.yaml | 2 +- deeplightning/init/initializers.py | 2 +- deeplightning/registry.py | 46 +------ deeplightning/task/vision/segmentation.py | 11 +- .../hooks/SemanticSegmentation_hooks.py | 3 +- deeplightning/utils/metrics.py | 42 +++++- 7 files changed, 57 insertions(+), 176 deletions(-) diff --git a/README.md b/README.md index 2b584a7..3c8b9da 100755 --- a/README.md +++ b/README.md @@ -1,11 +1,12 @@ +<<< 
in active development >>> +

Deep Lightning

- - +

**Deep Lightning** is a configuration-based wrapper for training Deep Learning models with focus on parallel training, cross-platform compatibility and reproducibility. The philosophy is simple: from configuration to trackable and reproducible deep learning. @@ -40,10 +41,7 @@ trainer.fit(model, data) * [Run](#run) * [Configure](#configure) * [Customize](#customize) -* [Examples](#examples) -* [Results](#results) -* [Development](#development) -* [Further Reading](#further-reading) +* [Examples](#examples) # Overview @@ -113,25 +111,8 @@ When a training run has been initiated, a link will be displayed in the terminal All config fields labelled `type` correspond to target classes. The format is `MODULE.CLASS` and the code will load class `CLASS` from `MODULE.py` (relative path). Note that `MODULE` can itself be composite, `X.Y.Z`, in which case the class `CLASS` will be loaded from `X/Y/Z.py`. For example, `model.optimizer.target` could be existing `deepspeed.ops.adam.FusedAdam` or user-defined in `losses.custom.MyLoss`. 
-### Example +Example: ```yaml -modes: - train: true - test: false - -task: ImageClassification - -data: - root: /data - dataset: MNIST - image_size: 28 - num_channels: 1 - num_classes: 10 - num_workers: 4 - batch_size: 256 - module: - target: deeplightning.data.dataloaders.image.mnist.MNIST - model: module: target: deeplightning.task.image.classification.TaskModule @@ -140,49 +121,6 @@ model: params: num_classes: 10 num_channels: 1 - optimizer: - target: torch.optim.SGD - params: - lr: 0.01 - weight_decay: 0.01 - momentum: 0.9 - scheduler: - target: torch.optim.lr_scheduler.ExponentialLR - params: - gamma: 0.99 - call: - interval: "epoch" - frequency: 1 - loss: - target: torch.nn.CrossEntropyLoss - params: - -engine: - accelerator: cpu - strategy: auto - devices: 1 - num_nodes: 1 - precision: 32 - -train: - num_epochs: 1 - val_every_n_epoch: 1 - grad_accum_from_epoch: 0 - grad_accum_every_n_batches: 1 - ckpt_resume_path: null - ckpt_monitor_metric: val_acc # used in `ModelCheckpoint` callback - ckpt_every_n_epochs: 1 - ckpt_save_top_k: 1 - early_stop_metric: null # used in `EarlyStopping` callback - early_stop_delta: 0.001 - early_stop_patience: 3 - -logger: - name: wandb - project_name: trial - tags: ["_"] # cannot be empty - notes: null - log_every_n_steps: 20 ``` ### Customize @@ -199,58 +137,3 @@ Beyond changing parameters values in existing configs, you can customize the fol See [`examples`](https://github.com/pme0/DeepLightning/tree/master/examples) for details. - -# Results - -[results on acceleration, memory use, etc.] - - -# Development - -### Functionalities -- [x] tracking logger (losses, learning rate, etc.) -- [x] artifact storing (config, image, etc.) 
-- [x] parallel training - - [x] multi-gpu - - [x] multi-node - - [x] backend engines: - - [x] ddp - - [x] deepspeed_stage_1 - - [x] deepspeed_stage_2 - - [ ] deepspeed_stage_3 (TODO resuming, sharded initialization) -- [x] 16-bit precision -- [x] periodic model checkpoints -- [ ] resume training from model checkpoint --- `deepspeed` untested [[docs](https://pytorch-lightning.readthedocs.io/en/stable/advanced/advanced_gpu.html#deepspeed)] [[docs](https://pytorch-lightning.readthedocs.io/en/stable/advanced/advanced_gpu.html#collating-single-file-checkpoint-for-deepspeed-zero-stage-3)]; -- [ ] sharded loading via LightningModule hook `configure_sharded_model(self):` [[docs](https://pytorch-lightning.readthedocs.io/en/latest/advanced/model_parallel.html#enabling-module-sharding-for-maximum-memory-efficiency)]; -- [x] gradient accumulation -- [x] early stopping -- [x] prediction API [TODO: add batch support] -- [ ] multiple losses/optimizers e.g. GAN; [[docs](https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html?highlight=configure_optimizers#configure-optimizers)]; though deepspeed doesn't allow this atm "DeepSpeed currently only supports single optimizer, single scheduler within the training loop." [[docs](https://pytorch-lightning.readthedocs.io/en/stable/advanced/advanced_gpu.html#deepspeed)] -- [x] reproducible examples - - [x] image classification - - [x] image reconstruction -- [ ] model registry - - -### Notes - -- :triangular_flag_on_post: on `deepspeed=0.5.10`, optimizer `deepspeed.ops.adam.FusedAdam` gives `AssertionError: CUDA_HOME does not exist, unable to compile CUDA op(s)`. 
Mentioned in issue [#1279](https://github.com/microsoft/DeepSpeed/issues/1279); -- :warning: effective batch size is `batch * num_gpus * num_nodes` [[docs](https://pytorch-lightning.readthedocs.io/en/stable/advanced/multi_gpu.html#batch-size)] but huge batch size can cause convergence difficulties [[paper](https://arxiv.org/abs/1706.02677)]; -- :warning: deepspeed single-file checkpointing requires caution [[docs](https://pytorch-lightning.readthedocs.io/en/latest/advanced/advanced_gpu.html#collating-single-file-checkpoint-for-deepspeed-zero-stage-3)] [[docs](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.plugins.training_type.DeepSpeedPlugin.html)] - - -# Further Reading - -**Pytorch-Lightning** organises modules as hardware-agnostic and loop-less code; separated model and backend engine for scalable deep learning -|| :information_source: [website](https://lightning.ai/) | :floppy_disk: [github](https://github.com/Lightning-AI/lightning) || - -**Weights & Biases** tracks machine learning experiments and provided real-time visualisation via a web interface -|| :information_source: [website](https://wandb.ai/site) | :floppy_disk: [github](https://github.com/wandb/wandb) || - -**DeepSpeed** is a distributed backend which reduces the training memory footprint with a Zero Redundancy Optimizer (ZeRO). It partitions model states and gradients to save memory, unlike traditional data parallelism where memory states are replicated across data-parallel processes. 
This allows training of large models with large batch sizes -|| :information_source: [website](https://www.deepspeed.ai) | :floppy_disk: [github](https://github.com/microsoft/DeepSpeed) | :page_with_curl: [ZeRO-3](https://arxiv.org/abs/1910.02054) || - -**Flask** is a server-side web framework that supports building and deploying web applications such as ML prediction APIs -|| :information_source: [website](https://flask.palletsprojects.com) | :floppy_disk: [github](https://github.com/pallets/flask) || - - diff --git a/configs/SkinLesionSegmentation.yaml b/configs/SkinLesionSegmentation.yaml index 493df94..c81f0ae 100755 --- a/configs/SkinLesionSegmentation.yaml +++ b/configs/SkinLesionSegmentation.yaml @@ -75,7 +75,7 @@ test: logger: name: wandb - project_name: trial + project_name: skinlesion tags: ["_"] # cannot be empty notes: null log_every_n_steps: 20 \ No newline at end of file diff --git a/deeplightning/init/initializers.py b/deeplightning/init/initializers.py index dfa916c..bcc6e09 100755 --- a/deeplightning/init/initializers.py +++ b/deeplightning/init/initializers.py @@ -7,7 +7,7 @@ from deeplightning.config.defaults import __ConfigGroups__ from deeplightning.init.imports import init_module from deeplightning.trainer.trainer import DLTrainer -#from deeplightning.registry import __MetricsRegistry__ +#from deeplightning.utils.metrics import __MetricsRegistry__ diff --git a/deeplightning/registry.py b/deeplightning/registry.py index e2c031f..88a1ce0 100755 --- a/deeplightning/registry.py +++ b/deeplightning/registry.py @@ -1,7 +1,6 @@ from lightning.pytorch.loggers import WandbLogger #from deeplightning.logger.wandb import wandbLogger -""" from deeplightning.trainer.hooks.ImageClassification_hooks import ( training_step__ImageClassification, training_step_end__ImageClassification, @@ -23,19 +22,7 @@ test_step_end__SemanticSegmentation, on_test_epoch_end__SemanticSegmentation) -from deeplightning.trainer.hooks.AudioClassification_hooks import ( - 
training_step__AudioClassification, - training_step_end__AudioClassification, - training_epoch_end__AudioClassification, - validation_step__AudioClassification, - validation_step_end__AudioClassification, - validation_epoch_end__AudioClassification, - test_step__AudioClassification, - test_step_end__AudioClassification, - test_epoch_end__AudioClassification) -from deeplightning.utils.metrics import Metric_Accuracy, Metric_ConfusionMatrix, Metric_PrecisionRecallCurve -""" from typing import Any, Callable, List, Type, TypeVar T = TypeVar('T') @@ -103,7 +90,7 @@ def get_element_names(self) -> List: } -''' + __HooksRegistry__ = { # Image "ImageClassification": { @@ -149,34 +136,3 @@ def get_element_names(self) -> List: } -__MetricsRegistry__ = { - # Image - "ImageClassification": { - "Accuracy_train": Metric_Accuracy, - "Accuracy_val": Metric_Accuracy, - "Accuracy_test": Metric_Accuracy, - "ConfusionMatrix_val": Metric_ConfusionMatrix, - "ConfusionMatrix_test": Metric_ConfusionMatrix, - "PrecisionRecallCurve_val": Metric_PrecisionRecallCurve, - "PrecisionRecallCurve_test": Metric_PrecisionRecallCurve, - }, - "ImageReconstruction": { - "_": None, - }, - "ObjectDetection": { - "_": None, - }, - "SemanticSegmentation": { - "Accuracy_train": Metric_Accuracy, - "Accuracy_val": Metric_Accuracy, - "Accuracy_test": Metric_Accuracy, - }, - # Audio - "AudioClassification":{ - "Accuracy": Metric_Accuracy, - "ConfusionMatrix": Metric_ConfusionMatrix, - "PrecisionRecallCurve": Metric_PrecisionRecallCurve, - }, -} - -''' \ No newline at end of file diff --git a/deeplightning/task/vision/segmentation.py b/deeplightning/task/vision/segmentation.py index 4320de3..b0b5d4e 100755 --- a/deeplightning/task/vision/segmentation.py +++ b/deeplightning/task/vision/segmentation.py @@ -8,7 +8,8 @@ from deeplightning.init.initializers import init_metrics from deeplightning.trainer.gather import gather_on_step, gather_on_epoch from deeplightning.utils.messages import info_message -from 
deeplightning.registry import __MetricsRegistry__, __HooksRegistry__ +from deeplightning.registry import __HooksRegistry__ +from deeplightning.utils.metrics import classification_accuracy @@ -48,7 +49,13 @@ def __init__(self, cfg: OmegaConf): # Initialise metrics to track during training torch_device = torch.device("cuda") if cfg.engine.accelerator == "gpu" else torch.device('cpu') - self.metrics = init_metrics(cfg, device=torch_device) + + #self.metrics = init_metrics(cfg, device=torch_device) + self.metrics = { + "Accuracy_train": classification_accuracy(cfg), + "Accuracy_val": classification_accuracy(cfg), + "Accuracy_test": classification_accuracy(cfg), + } # Initialise label to track metrics against self.step_label = "iteration" diff --git a/deeplightning/trainer/hooks/SemanticSegmentation_hooks.py b/deeplightning/trainer/hooks/SemanticSegmentation_hooks.py index 784aca9..13c4d99 100644 --- a/deeplightning/trainer/hooks/SemanticSegmentation_hooks.py +++ b/deeplightning/trainer/hooks/SemanticSegmentation_hooks.py @@ -85,12 +85,13 @@ def validation_step__SemanticSegmentation(self, batch, batch_idx): outputs = process_model_outputs(outputs, self.model) preds = torch.argmax(outputs, dim=1) - raise + ''' for i in range(5): print(batch["inputs_paths"][i]) print(batch["masks_paths"][i]) save_image(preds[0].unsqueeze(0).float(), fp=f"/Users/pme/Downloads/segm/mask_step{self.global_step}.jpeg") i += 1 + ''' # loss val_loss = self.loss(outputs, batch["masks"]) diff --git a/deeplightning/utils/metrics.py b/deeplightning/utils/metrics.py index 9aeb698..1598587 100755 --- a/deeplightning/utils/metrics.py +++ b/deeplightning/utils/metrics.py @@ -12,7 +12,8 @@ from matplotlib.figure import Figure from matplotlib import pyplot as plt -from deeplightning.registry import METRICS_REGISTRY +from deeplightning import METRIC_REGISTRY +#from deeplightning.utils.metrics import Metric_Accuracy, Metric_ConfusionMatrix, Metric_PrecisionRecallCurve __all__ = [ @@ -22,6 +23,39 @@ ] +''' 
+__MetricsRegistry__ = { + # Image + "ImageClassification": { + "Accuracy_train": Metric_Accuracy, + "Accuracy_val": Metric_Accuracy, + "Accuracy_test": Metric_Accuracy, + "ConfusionMatrix_val": Metric_ConfusionMatrix, + "ConfusionMatrix_test": Metric_ConfusionMatrix, + "PrecisionRecallCurve_val": Metric_PrecisionRecallCurve, + "PrecisionRecallCurve_test": Metric_PrecisionRecallCurve, + }, + "ImageReconstruction": { + "_": None, + }, + "ObjectDetection": { + "_": None, + }, + "SemanticSegmentation": { + "Accuracy_train": Metric_Accuracy, + "Accuracy_val": Metric_Accuracy, + "Accuracy_test": Metric_Accuracy, + }, + # Audio + "AudioClassification":{ + "Accuracy": Metric_Accuracy, + "ConfusionMatrix": Metric_ConfusionMatrix, + "PrecisionRecallCurve": Metric_PrecisionRecallCurve, + }, +} +''' + + class ClassificationAccuracy(MulticlassAccuracy): """Classification Accuracy metric, inheriting from torchmetrics """ @@ -33,7 +67,7 @@ def __init__(self, cfg: OmegaConf): super().__init__(**args) -@METRICS_REGISTRY.register_element() +@METRIC_REGISTRY.register_element() def classification_accuracy(cfg) -> ClassificationAccuracy: return ClassificationAccuracy(cfg) @@ -80,7 +114,7 @@ def draw(self, return fig -@METRICS_REGISTRY.register_element() +@METRIC_REGISTRY.register_element() def precision_recall_curve(cfg) -> PrecisionRecallCurve: return PrecisionRecallCurve(cfg) @@ -130,7 +164,7 @@ def draw(self, return fig -@METRICS_REGISTRY.register_element() +@METRIC_REGISTRY.register_element() def confusion_matrix(cfg) -> ConfusionMatrix: return ConfusionMatrix(cfg) From dce0b46d63396ff0433bebda9b6e2d543463df7f Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Sat, 4 Nov 2023 15:36:34 +0000 Subject: [PATCH 11/19] add docs for class attributes --- configs/SkinLesionSegmentation.yaml | 2 +- deeplightning/task/base.py | 76 +++++++++++++++-------------- 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/configs/SkinLesionSegmentation.yaml 
b/configs/SkinLesionSegmentation.yaml index c81f0ae..6d6aef2 100755 --- a/configs/SkinLesionSegmentation.yaml +++ b/configs/SkinLesionSegmentation.yaml @@ -75,7 +75,7 @@ test: logger: name: wandb - project_name: skinlesion + project_name: skin-lesion-segmentation tags: ["_"] # cannot be empty notes: null log_every_n_steps: 20 \ No newline at end of file diff --git a/deeplightning/task/base.py b/deeplightning/task/base.py index c343514..cdc0bd8 100644 --- a/deeplightning/task/base.py +++ b/deeplightning/task/base.py @@ -12,65 +12,67 @@ class BaseTask(pl.LightningModule): - """Base task module - - LOGGING: manual logging `self.logger.log()` is used. This is more - flexible as Lightning automatic logging `self.log()`) only allows - scalars, not histograms, images, etc./ Additionally, auto-logging - doesn't log at step 0, which is useful. - - HOOKS: For *training*, the input to `training_epoch_end()` is the - set of outputs from `training_step()`. For *validation*, the input - to `validation_epoch_end()` is the output from `validation_step_end()` - and the input to `validation_step_end()` is the output from - `validation_step()`. - See https://github.com/PyTorchLightning/pytorch-lightning/issues/9811 - - Args + """Base task module. + + Notes: + logging: manual logging `self.logger.log()` is used. This is more + flexible as Lightning automatic logging `self.log()`) only + allows scalars, not histograms, images, etc./ Additionally, + auto-logging doesn't log at step 0, which is useful. + hooks: For *training*, the input to `training_epoch_end()` is the + set of outputs from `training_step()`. For *validation*, the + input to `validation_epoch_end()` is the output from + `validation_step_end()` and the input to `validation_step_end()` + is the output from `validation_step()`. 
See + https://github.com/PyTorchLightning/pytorch-lightning/issues/9811 + + Args: cfg: yaml configuration object - - """ - def __init__(self, cfg: OmegaConf): + Attributes: + cfg: (OmegaConf) yaml configuration object. + step_label: (str) label to track/log metrics against. + sanity_check: (bool) Lightning performs a partial validation epoch + at the start, to ensure no issues at the validation stage. The + attribute `sanity_check` is set to `True` initially and set to + `False` after the sanity check run is complete. This is to + prevent logging during that preliminary run. + """ + def __init__(self, cfg: OmegaConf) -> None: super().__init__() self.cfg = cfg #TODO check if this contains logger runtime params - - # Lightning performs a partial validation epoch to ensure that - # everything is correct. Use this to avoid logging during that - self.sanity_check = True - - # Initialise label to track metrics against self.step_label = "iteration" - - # Aggregation utilities - self.gather_on_step = gather_on_step - self.gather_on_epoch = gather_on_epoch - + self.sanity_check = True - def on_task_init_end(self): - """Attributes to initialise at the end of the `__init__` method - of the class that inherits from this `BaseTask` class. + def on_task_init_end(self) -> None: + """Additional attributes to initialise at the end of the `__init__` + method of the class that inherits from this `BaseTask` class. + + Attributes: + num_trainable_params: (int) number of trainable model parameters. + num_nontrainable_params: (int) number of nontrainable model parameters. + num_total_params: (int) number of total model parameters.
""" self.set_num_model_params() self.print_num_model_params() @property - def num_trainable_params(self): + def num_trainable_params(self) -> int: return self._num_trainable_params @property - def num_nontrainable_params(self): + def num_nontrainable_params(self) -> int: return self._num_nontrainable_params @property - def num_total_params(self): + def num_total_params(self) -> int: return self._num_total_params - def set_num_model_params(self): + def set_num_model_params(self) -> None: self._num_trainable_params = sum( p.numel() for p in self.model.parameters() if p.requires_grad ) @@ -80,7 +82,7 @@ def set_num_model_params(self): self._num_total_params = self._num_trainable_params + self._num_nontrainable_params - def print_num_model_params(self): + def print_num_model_params(self) -> None: info_message("Trainable model parameters: {:,d}".format(self.num_trainable_params)) info_message("Non-trainable model parameters: {:,d}".format(self.num_nontrainable_params)) info_message("Total model parameters: {:,d}".format(self.num_total_params)) From 06f74f15577b1eec165300e68e0688bc033ad6b5 Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Sat, 4 Nov 2023 16:37:08 +0000 Subject: [PATCH 12/19] add base task class --- configs/SkinLesionSegmentation.yaml | 2 +- deeplightning/task/vision/segmentation.py | 42 +++++++------------ .../hooks/SemanticSegmentation_hooks.py | 12 +++++- deeplightning/trainer/utils.py | 9 ---- 4 files changed, 27 insertions(+), 38 deletions(-) delete mode 100644 deeplightning/trainer/utils.py diff --git a/configs/SkinLesionSegmentation.yaml b/configs/SkinLesionSegmentation.yaml index 6d6aef2..d8c9b40 100755 --- a/configs/SkinLesionSegmentation.yaml +++ b/configs/SkinLesionSegmentation.yaml @@ -26,7 +26,7 @@ data: model: module: - target: deeplightning.task.vision.segmentation.TaskModule + target: deeplightning.task.vision.segmentation.SemanticSegmentationTask network: target: 
torchvision.models.segmentation.deeplabv3_resnet50 params: diff --git a/deeplightning/task/vision/segmentation.py b/deeplightning/task/vision/segmentation.py index b0b5d4e..eef4515 100755 --- a/deeplightning/task/vision/segmentation.py +++ b/deeplightning/task/vision/segmentation.py @@ -10,29 +10,17 @@ from deeplightning.utils.messages import info_message from deeplightning.registry import __HooksRegistry__ from deeplightning.utils.metrics import classification_accuracy +from deeplightning.task.base import BaseTask +class SemanticSegmentationTask(BaseTask): + """ Task module for Semantic Segmentation. -class TaskModule(pl.LightningModule): - """ Task module for Image Classification. - - LOGGING: manual logging `self.logger.log()` is used. This - is more flexible as PyTorchLightning automatic logging - `self.log()`) only allows scalars, not histograms, images, etc. - Additionally, auto-logging doesn't log at step 0, which is useful. - - Parameters - ---------- - cfg : yaml configuration object - + Args: + cfg: yaml configuration object """ - def __init__(self, cfg: OmegaConf): - super().__init__() - self.cfg = cfg #TODO check if this contains logger runtime params - self.num_classes = cfg.model.network.params.num_classes - #self.classif_task = "binary" if self.num_classes == 2 else "multiclass" - + super().__init__(cfg=cfg) self.loss = init_obj_from_config(cfg.model.loss) self.model = init_obj_from_config(cfg.model.network) self.optimizer = init_obj_from_config(cfg.model.optimizer, self.model.parameters()) @@ -64,15 +52,15 @@ def __init__(self, cfg: OmegaConf): # to make the hooks bound to the class (so that they can access class attributes # using `self.something`), the assignment must specify the class name as follows: # `ClassName.fn = my_fn` rather than `self.fn = my_fn` - TaskModule._training_step = __HooksRegistry__[cfg.task]["training_step"] - TaskModule._training_step_end = __HooksRegistry__[cfg.task]["training_step_end"] - TaskModule._on_training_epoch_end 
= __HooksRegistry__[cfg.task]["on_training_epoch_end"] - TaskModule._validation_step = __HooksRegistry__[cfg.task]["validation_step"] - TaskModule._validation_step_end = __HooksRegistry__[cfg.task]["validation_step_end"] - TaskModule._on_validation_epoch_end = __HooksRegistry__[cfg.task]["on_validation_epoch_end"] - TaskModule._test_step = __HooksRegistry__[cfg.task]["test_step"] - TaskModule._test_step_end = __HooksRegistry__[cfg.task]["test_step_end"] - TaskModule._on_test_epoch_end = __HooksRegistry__[cfg.task]["on_test_epoch_end"] + SemanticSegmentationTask._training_step = __HooksRegistry__[cfg.task]["training_step"] + SemanticSegmentationTask._training_step_end = __HooksRegistry__[cfg.task]["training_step_end"] + SemanticSegmentationTask._on_training_epoch_end = __HooksRegistry__[cfg.task]["on_training_epoch_end"] + SemanticSegmentationTask._validation_step = __HooksRegistry__[cfg.task]["validation_step"] + SemanticSegmentationTask._validation_step_end = __HooksRegistry__[cfg.task]["validation_step_end"] + SemanticSegmentationTask._on_validation_epoch_end = __HooksRegistry__[cfg.task]["on_validation_epoch_end"] + SemanticSegmentationTask._test_step = __HooksRegistry__[cfg.task]["test_step"] + SemanticSegmentationTask._test_step_end = __HooksRegistry__[cfg.task]["test_step_end"] + SemanticSegmentationTask._on_test_epoch_end = __HooksRegistry__[cfg.task]["on_test_epoch_end"] # Aggregation utilities self.gather_on_step = gather_on_step diff --git a/deeplightning/trainer/hooks/SemanticSegmentation_hooks.py b/deeplightning/trainer/hooks/SemanticSegmentation_hooks.py index 13c4d99..6f71d33 100644 --- a/deeplightning/trainer/hooks/SemanticSegmentation_hooks.py +++ b/deeplightning/trainer/hooks/SemanticSegmentation_hooks.py @@ -4,9 +4,19 @@ from deeplightning.trainer.batch import dictionarify_batch from deeplightning.trainer.gather import gather_on_step, gather_on_epoch -from deeplightning.trainer.utils import process_model_outputs +def process_model_outputs(outputs, 
model): + """Processes model outouts and selects the appropriate elements + """ + if model.__class__.__name__ == "DeepLabV3": + # `DeepLabV3` returns a dictionaty with keys `out` (segmentation + # mask) and optionally `aux` if an auxiliary classifier is used. + return outputs["out"] + else: + return outputs + + def training_step__SemanticSegmentation(self, batch, batch_idx): """ Hook for `training_step`. diff --git a/deeplightning/trainer/utils.py b/deeplightning/trainer/utils.py deleted file mode 100644 index 9bf941b..0000000 --- a/deeplightning/trainer/utils.py +++ /dev/null @@ -1,9 +0,0 @@ - -def process_model_outputs(outputs, model): - """Processes model outouts and selects the appropriate elements - """ - - if model.__class__.__name__ == "DeepLabV3": - # `DeepLabV3` returns a dictionaty with keys `out` (segmentation mask) - # and optionally `aux` if an auxiliary classifier is used. - return outputs["out"] \ No newline at end of file From d080049e4b18f4ad03ddbb995946f17a3b6a459a Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Sat, 4 Nov 2023 16:51:58 +0000 Subject: [PATCH 13/19] bring hooks inside lightningmodule --- deeplightning/task/specs.py | 1 - deeplightning/task/vision/segmentation.py | 241 +++++++++++++++++++--- deeplightning/trainer/batch.py | 2 +- 3 files changed, 209 insertions(+), 35 deletions(-) diff --git a/deeplightning/task/specs.py b/deeplightning/task/specs.py index 1bf471b..97c238a 100644 --- a/deeplightning/task/specs.py +++ b/deeplightning/task/specs.py @@ -22,7 +22,6 @@ def __init__(self, cfg: OmegaConf): class ImageClassificationTask(TaskSpecification): def __init__(self, cfg: OmegaConf): super().__init__() - if cfg.task = self.metrics = [ "classification_accuracy", ] diff --git a/deeplightning/task/vision/segmentation.py b/deeplightning/task/vision/segmentation.py index eef4515..9c40876 100755 --- a/deeplightning/task/vision/segmentation.py +++ b/deeplightning/task/vision/segmentation.py @@ -11,8 +11,20 
@@ from deeplightning.registry import __HooksRegistry__ from deeplightning.utils.metrics import classification_accuracy from deeplightning.task.base import BaseTask +from deeplightning.trainer.batch import dictionarify_batch +def process_model_outputs(outputs, model): + """Processes model outputs and selects the appropriate elements + """ + if model.__class__.__name__ == "DeepLabV3": + # `DeepLabV3` returns a dictionary with keys `out` (segmentation + # mask) and optionally `aux` if an auxiliary classifier is used. + return outputs["out"] + else: + return outputs + + class SemanticSegmentationTask(BaseTask): """ Task module for Semantic Segmentation. @@ -52,19 +64,19 @@ def __init__(self, cfg: OmegaConf): # to make the hooks bound to the class (so that they can access class attributes # using `self.something`), the assignment must specify the class name as follows: # `ClassName.fn = my_fn` rather than `self.fn = my_fn` - SemanticSegmentationTask._training_step = __HooksRegistry__[cfg.task]["training_step"] - SemanticSegmentationTask._training_step_end = __HooksRegistry__[cfg.task]["training_step_end"] - SemanticSegmentationTask._on_training_epoch_end = __HooksRegistry__[cfg.task]["on_training_epoch_end"] - SemanticSegmentationTask._validation_step = __HooksRegistry__[cfg.task]["validation_step"] - SemanticSegmentationTask._validation_step_end = __HooksRegistry__[cfg.task]["validation_step_end"] - SemanticSegmentationTask._on_validation_epoch_end = __HooksRegistry__[cfg.task]["on_validation_epoch_end"] - SemanticSegmentationTask._test_step = __HooksRegistry__[cfg.task]["test_step"] - SemanticSegmentationTask._test_step_end = __HooksRegistry__[cfg.task]["test_step_end"] - SemanticSegmentationTask._on_test_epoch_end = __HooksRegistry__[cfg.task]["on_test_epoch_end"] + #SemanticSegmentationTask._training_step = __HooksRegistry__[cfg.task]["training_step"] + #SemanticSegmentationTask._training_step_end = __HooksRegistry__[cfg.task]["training_step_end"] +
#SemanticSegmentationTask._on_training_epoch_end = __HooksRegistry__[cfg.task]["on_training_epoch_end"] + #SemanticSegmentationTask._validation_step = __HooksRegistry__[cfg.task]["validation_step"] + #SemanticSegmentationTask._validation_step_end = __HooksRegistry__[cfg.task]["validation_step_end"] + #SemanticSegmentationTask._on_validation_epoch_end = __HooksRegistry__[cfg.task]["on_validation_epoch_end"] + #SemanticSegmentationTask._test_step = __HooksRegistry__[cfg.task]["test_step"] + #SemanticSegmentationTask._test_step_end = __HooksRegistry__[cfg.task]["test_step_end"] + #SemanticSegmentationTask._on_test_epoch_end = __HooksRegistry__[cfg.task]["on_test_epoch_end"] # Aggregation utilities - self.gather_on_step = gather_on_step - self.gather_on_epoch = gather_on_epoch + #self.gather_on_step = gather_on_step + #self.gather_on_epoch = gather_on_epoch # PyTorch-Lightning's model summary does not give the # correct number of trainable parameters; see @@ -102,8 +114,7 @@ def configure_optimizers(self) -> Tuple[dict]: `validation_step()`. https://github.com/PyTorchLightning/pytorch-lightning/issues/9811 - """ - + """ def training_step(self, batch, batch_idx): """ Hook for `training_step`. @@ -113,20 +124,58 @@ def training_step(self, batch, batch_idx): batch : object containing the data output by the dataloader. 
batch_idx : index of batch """ - return self._training_step(batch, batch_idx) + + # convert batch to dictionary form + batch = dictionarify_batch(batch, self.cfg.data.dataset) + + # forward pass + outputs = self.model(batch["inputs"]) + outputs = process_model_outputs(outputs, self.model) + + # loss + train_loss = self.loss(outputs, batch["masks"]) + + if "train_loss" not in self.training_step_outputs: + self.training_step_outputs["train_loss"] = [] + self.training_step_outputs["train_loss"].append(train_loss) + + # metrics + self.metrics["Accuracy_train"].update(preds=outputs, target=batch["masks"]) + + # the output is not used but returning None gives the following warning + # """lightning/pytorch/loops/optimization/automatic.py:129: + # UserWarning: `training_step` returned `None`. If this was + # on purpose, ignore this warning...""" + return {"loss": train_loss} def training_step_end(self): """ Hook for `training_step_end`. """ - self._training_step_end() + if self.global_step % self.cfg.logger.log_every_n_steps == 0: + + metrics = {} + + metrics["train_loss"] = torch.stack(self.training_step_outputs["train_loss"]).mean() + self.training_step_outputs.clear() # free memory + + # accuracy (batch only) + metrics["train_acc"] = self.metrics["Accuracy_train"].compute() + self.metrics["Accuracy_train"].reset() + + # log learning rate + #metrics['lr'] = self.lr_schedulers().get_last_lr()[0] + + # log training metrics + metrics[self.step_label] = self.global_step + self.logger.log_metrics(metrics) def on_training_epoch_end(self): """ Hook for `on_training_epoch_end`. """ - self._on_training_epoch_end() - + pass + def validation_step(self, batch, batch_idx): """ Hook for `validation_step`. @@ -134,42 +183,168 @@ def validation_step(self, batch, batch_idx): Parameters ---------- batch : object containing the data output by the dataloader. - batch_idx : index of batch. 
- + batch_idx : index of batch """ - return self._validation_step(batch, batch_idx) + + # convert batch to dictionary form + batch = dictionarify_batch(batch, self.cfg.data.dataset) + + # forward pass + outputs = self.model(batch["inputs"]) + outputs = process_model_outputs(outputs, self.model) + preds = torch.argmax(outputs, dim=1) + + ''' + for i in range(5): + print(batch["inputs_paths"][i]) + print(batch["masks_paths"][i]) + save_image(preds[0].unsqueeze(0).float(), fp=f"/Users/pme/Downloads/segm/mask_step{self.global_step}.jpeg") + i += 1 + ''' + + # loss + val_loss = self.loss(outputs, batch["masks"]) + + if "val_loss" not in self.validation_step_outputs: + self.validation_step_outputs["val_loss"] = [] + self.validation_step_outputs["val_loss"].append(val_loss) + + # metrics + self.metrics["Accuracy_val"].update(preds = preds, target = batch["masks"]) + #self.metrics["ConfusionMatrix_val"].update(preds = preds, target = batch["masks"]) + #self.metrics["PrecisionRecallCurve_val"].update(preds = outputs, target = batch["masks"]) - def validation_step_end(self): + def validation_step_end__SemanticSegmentation(self): """ Hook for `validation_step_end`. """ - return self._validation_step_end() + pass - def on_validation_epoch_end(self): - """ Hook for `validation_epoch_end`. + def on_validation_epoch_end__SemanticSegmentation(self): + """ Hook for `on_validation_epoch_end`. 
""" - self._on_validation_epoch_end() - - def test_step(self, batch, batch_idx): + #TODO confirm on multi-gpu + #print('\nself.validation_step_outputs["val_loss"]', len(self.validation_step_outputs["val_loss"]), '\n') + + metrics = {} + metrics["val_loss"] = torch.stack(self.validation_step_outputs["val_loss"]).mean() + self.validation_step_outputs.clear() # free memory + + # accuracy + metrics["val_acc"] = self.metrics["Accuracy_val"].compute() + self.metrics["Accuracy_val"].reset() + + # confusion matrix + ''' + cm = self.metrics["ConfusionMatrix_val"].compute() + figure = self.metrics["ConfusionMatrix_val"].draw( + confusion_matrix=cm, subset="val", epoch=self.current_epoch+1) + metrics["val_confusion_matrix"] = wandb.Image(figure, + caption=f"Confusion Matrix [val, epoch {self.current_epoch+1}]") + self.metrics["ConfusionMatrix_val"].reset() + ''' + + # precision-recall + ''' + precision, recall, thresholds = self.metrics["PrecisionRecallCurve_val"].compute() + figure = self.metrics["PrecisionRecallCurve_val"].draw( + precision=precision, recall=recall, thresholds=thresholds, + subset="val", epoch=self.current_epoch+1) + metrics["val_precision_recall"] = wandb.Image(figure, + caption=f"Precision-Recall Curve [val, epoch {self.current_epoch+1}]") + self.metrics["PrecisionRecallCurve_val"].reset() + ''' + + # log validation metrics + metrics[self.step_label] = self.global_step + if not self.sanity_check: + self.logger.log_metrics(metrics) + self.sanity_check = False + + # The following is required for EarlyStopping and ModelCheckpoint callbacks to work properly. + # Callbacks read from `self.log()`, not from `self.logger.log()`, so need to log there. 
+ # [EarlyStopping] key `m = self.cfg.train.early_stop_metric` must exist in `metrics` + if self.cfg.train.early_stop_metric is not None: + m_earlystop = self.cfg.train.early_stop_metric + self.log(m_earlystop, metrics[m_earlystop], sync_dist=True) + # [ModelCheckpoint] key `m = self.cfg.train.ckpt_monitor_metric` must exist in `metrics` + if self.cfg.train.ckpt_monitor_metric is not None: + m_checkpoint = self.cfg.train.ckpt_monitor_metric + self.log(m_checkpoint, metrics[m_checkpoint], sync_dist=True) + + + def test_step__SemanticSegmentation(self, batch, batch_idx): """ Hook for `test_step`. Parameters ---------- - batch : object containing the data output by the dataloader. + batch : object containing the data output by the dataloader. batch_idx: index of batch. """ - return self._test_step(batch, batch_idx) + # convert batch to dictionary form + batch = dictionarify_batch(batch, self.cfg.data.dataset) + + # forward pass + outputs = self.model(batch["inputs"]) + outputs = process_model_outputs(outputs, self.model) + preds = torch.argmax(outputs, dim=1) + + # loss + test_loss = self.loss(outputs, batch["masks"]) + + if "test_loss" not in self.test_step_outputs: + self.test_step_outputs["test_loss"] = [] + self.test_step_outputs["test_loss"].append(test_loss) - def test_step_end(self): + # metrics + self.metrics["Accuracy_test"].update(preds = preds, target = batch["masks"]) + #self.metrics["ConfusionMatrix_test"].update(preds = preds, target = batch["masks"]) + #self.metrics["PrecisionRecallCurve_test"].update(preds = outputs, target = batch["masks"]) + + + def test_step_end__SemanticSegmentation(self): """ Hook for `test_step_end`. """ - return self._test_step_end() + pass - def on_test_epoch_end(self): + def on_test_epoch_end__SemanticSegmentation(self): """ Hook for `on_test_epoch_end`. 
""" - self._on_test_epoch_end() \ No newline at end of file + + metrics = {} + metrics["test_loss"] = torch.stack(self.test_step_outputs["test_loss"]).mean() + self.test_step_outputs.clear() # free memory + + # accuracy + metrics["test_acc"] = self.metrics["Accuracy_test"].compute() + self.metrics["Accuracy_test"].reset() + + # confusion matrix + ''' + cm = self.metrics["ConfusionMatrix_test"].compute() + figure = self.metrics["ConfusionMatrix_test"].draw( + confusion_matrix=cm, subset="test", epoch=self.current_epoch+1) + metrics["test_confusion_matrix"] = wandb.Image(figure, + caption=f"Confusion Matrix [test, epoch {self.current_epoch+1}]") + self.metrics["ConfusionMatrix_test"].reset() + ''' + + # precision-recall + ''' + precision, recall, thresholds = self.metrics["PrecisionRecallCurve_test"].compute() + figure = self.metrics["PrecisionRecallCurve_test"].draw( + precision=precision, recall=recall, thresholds=thresholds, + subset="test", epoch=self.current_epoch+1) + metrics["test_precision_recall"] = wandb.Image(figure, + caption=f"Precision-Recall Curve [test, epoch {self.current_epoch+1}]") + self.metrics["PrecisionRecallCurve_test"].reset() + ''' + + # log test metrics + metrics[self.step_label] = self.global_step + self.logger.log_metrics(metrics) + diff --git a/deeplightning/trainer/batch.py b/deeplightning/trainer/batch.py index 2fe3a04..024723a 100755 --- a/deeplightning/trainer/batch.py +++ b/deeplightning/trainer/batch.py @@ -1,7 +1,7 @@ from typing import Any -def dictionarify_batch(batch: Any, dataset: str): +def dictionarify_batch(batch: Any, dataset: str) -> dict: """Convert batch to dictionary format. 
Typically keys in this dictionary would be `inputs`, `targets` From 6dc1bbf88e4f921e6d8c82a5ff28d39a95a9c59d Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Sat, 4 Nov 2023 17:00:12 +0000 Subject: [PATCH 14/19] remove deprecated hooks --- deeplightning/task/base.py | 12 ------------ deeplightning/task/vision/segmentation.py | 10 +++++----- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/deeplightning/task/base.py b/deeplightning/task/base.py index cdc0bd8..28c5c93 100644 --- a/deeplightning/task/base.py +++ b/deeplightning/task/base.py @@ -100,10 +100,6 @@ def training_step(self, batch, batch_idx): raise NotImplementedError - def training_step_end(self): - raise NotImplementedError - - def on_training_epoch_end(self): raise NotImplementedError @@ -112,10 +108,6 @@ def validation_step(self, batch, batch_idx): raise NotImplementedError - def validation_step_end(self): - raise NotImplementedError - - def on_validation_epoch_end(self): raise NotImplementedError @@ -124,9 +116,5 @@ def test_step(self, batch, batch_idx): raise NotImplementedError - def test_step_end(self): - raise NotImplementedError - - def on_test_epoch_end(self): raise NotImplementedError \ No newline at end of file diff --git a/deeplightning/task/vision/segmentation.py b/deeplightning/task/vision/segmentation.py index 9c40876..145ac6e 100755 --- a/deeplightning/task/vision/segmentation.py +++ b/deeplightning/task/vision/segmentation.py @@ -215,13 +215,13 @@ def validation_step(self, batch, batch_idx): #self.metrics["PrecisionRecallCurve_val"].update(preds = outputs, target = batch["masks"]) - def validation_step_end__SemanticSegmentation(self): + def validation_step_end(self): """ Hook for `validation_step_end`. """ pass - def on_validation_epoch_end__SemanticSegmentation(self): + def on_validation_epoch_end(self): """ Hook for `on_validation_epoch_end`. 
""" @@ -275,7 +275,7 @@ def on_validation_epoch_end__SemanticSegmentation(self): self.log(m_checkpoint, metrics[m_checkpoint], sync_dist=True) - def test_step__SemanticSegmentation(self, batch, batch_idx): + def test_step(self, batch, batch_idx): """ Hook for `test_step`. Parameters @@ -305,13 +305,13 @@ def test_step__SemanticSegmentation(self, batch, batch_idx): #self.metrics["PrecisionRecallCurve_test"].update(preds = outputs, target = batch["masks"]) - def test_step_end__SemanticSegmentation(self): + def test_step_end(self): """ Hook for `test_step_end`. """ pass - def on_test_epoch_end__SemanticSegmentation(self): + def on_test_epoch_end(self): """ Hook for `on_test_epoch_end`. """ From 0e223b6ef37b9cbd2673ca3e2e23425212fd24e4 Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Sun, 5 Nov 2023 19:48:28 +0000 Subject: [PATCH 15/19] refactor hooks for lightning2.0 --- configs/SkinLesionSegmentation.yaml | 2 +- .../data/dataloaders/vision/ham10000.py | 37 ++++----- deeplightning/task/base.py | 24 +++++- deeplightning/task/vision/segmentation.py | 82 ++++--------------- 4 files changed, 58 insertions(+), 87 deletions(-) diff --git a/configs/SkinLesionSegmentation.yaml b/configs/SkinLesionSegmentation.yaml index d8c9b40..c5e940c 100755 --- a/configs/SkinLesionSegmentation.yaml +++ b/configs/SkinLesionSegmentation.yaml @@ -58,7 +58,7 @@ metrics: test: default train: - num_epochs: 10 + num_epochs: 2 val_every_n_epoch: 1 grad_accum_from_epoch: 0 grad_accum_every_n_batches: 1 diff --git a/deeplightning/data/dataloaders/vision/ham10000.py b/deeplightning/data/dataloaders/vision/ham10000.py index 7d1e77e..42317b2 100644 --- a/deeplightning/data/dataloaders/vision/ham10000.py +++ b/deeplightning/data/dataloaders/vision/ham10000.py @@ -27,19 +27,17 @@ def _extract_masks(metadata, root): class HAM10000_dataset(Dataset): - """HAM10000 Dataset ("Human Against Machine with 10000 training images") - for Image Classification and Semantic 
Segmentation. + """HAM10000 Dataset for Image Classification and Semantic Segmentation. It contains dermatoscopic images from different populations, acquired and stored by different modalities. Cases include a representative collection of all important diagnostic categories in the realm of pigmented lesions. - Statistics & Details - -------------------- - - images and segmentation masks size: (width,height)=(600,450) - - normalization constants for images: mean=(?,) and std=(?,) - - number of samples: 10015 - - number of image classes: 7 - - number of segmentation classes: 7 () + images and segmentation masks size: (width,height)=(600,450) + normalization constants for images: mean=(?,) and std=(?,) + number of samples: 10015 + number of image classes: 7 + number of segmentation classes: 7 + |-------|-------------|------------------------------------------------------------------------------------------------------------| | label | no. samples | description | |-------|-------------|------------------------------------------------------------------------------------------------------------| @@ -52,17 +50,16 @@ class HAM10000_dataset(Dataset): | VASC | 142 | vascular lesions (angiomas, angiokeratomas, pyogenic granulomas and hemorrhage) | |-------|-------------|------------------------------------------------------------------------------------------------------------| - References - ---------- - - Tschandl, P., Rosendahl, C., & Kittler, H. (2018). "The HAM10000 - dataset, a large collection of multi-source dermatoscopic images - of common pigmented skin lesions". Scientific data, 5(1), 1-9. 
- - https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/DBW86T - - Arguments - --------- - cfg : configuration object - transform : Transforms to be applied to images + References: + > "Human Against Machine with 10000 training images" + > https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/DBW86T + > Tschandl, P., Rosendahl, C., & Kittler, H. (2018). "The HAM10000 + dataset, a large collection of multi-source dermatoscopic images + of common pigmented skin lesions". Scientific data, 5(1), 1-9. + + Args: + cfg: configuration object + transform: Transforms to be applied to images """ def __init__(self, diff --git a/deeplightning/task/base.py b/deeplightning/task/base.py index 28c5c93..44990fd 100644 --- a/deeplightning/task/base.py +++ b/deeplightning/task/base.py @@ -73,6 +73,8 @@ def num_total_params(self) -> int: def set_num_model_params(self) -> None: + """Set the number of model parameters as attributes of the class. + """ self._num_trainable_params = sum( p.numel() for p in self.model.parameters() if p.requires_grad ) @@ -83,10 +85,12 @@ def set_num_model_params(self) -> None: def print_num_model_params(self) -> None: + """Print the number of model parameters. + """ info_message("Trainable model parameters: {:,d}".format(self.num_trainable_params)) info_message("Non-trainable model parameters: {:,d}".format(self.num_nontrainable_params)) info_message("Total model parameters: {:,d}".format(self.num_total_params)) - + def forward(self, x: Tensor) -> Tensor: raise NotImplementedError @@ -97,6 +101,12 @@ def configure_optimizers(self) -> Tuple[dict]: def training_step(self, batch, batch_idx): + """ Lightning hook for training step. + + Args: + batch: object containing the data output by the dataloader. + batch_idx: index of batch + """ raise NotImplementedError @@ -105,6 +115,12 @@ def on_training_epoch_end(self): def validation_step(self, batch, batch_idx): + """ Lightning hook for validation step. 
+ + Args: + batch: object containing the data output by the dataloader. + batch_idx: index of batch + """ raise NotImplementedError @@ -113,6 +129,12 @@ def on_validation_epoch_end(self): def test_step(self, batch, batch_idx): + """ Lightning hook for test step. + + Args: + batch: object containing the data output by the dataloader. + batch_idx: index of batch + """ raise NotImplementedError diff --git a/deeplightning/task/vision/segmentation.py b/deeplightning/task/vision/segmentation.py index 145ac6e..29b8356 100755 --- a/deeplightning/task/vision/segmentation.py +++ b/deeplightning/task/vision/segmentation.py @@ -2,11 +2,14 @@ from omegaconf import OmegaConf import torch from torch import Tensor +from torchvision.utils import save_image + import lightning as pl +from lightning.pytorch.trainer.states import RunningStage from deeplightning.init.imports import init_obj_from_config -from deeplightning.init.initializers import init_metrics -from deeplightning.trainer.gather import gather_on_step, gather_on_epoch +#from deeplightning.init.initializers import init_metrics +#from deeplightning.trainer.gather import gather_on_step, gather_on_epoch from deeplightning.utils.messages import info_message from deeplightning.registry import __HooksRegistry__ from deeplightning.utils.metrics import classification_accuracy @@ -45,7 +48,7 @@ def __init__(self, cfg: OmegaConf): # PyTorch-Lightning performs a partial validation epoch to ensure that # everything is correct. Use this to avoid logging metrics to W&B for that - self.sanity_check = True + #self.sanity_check = True # Initialise metrics to track during training torch_device = torch.device("cuda") if cfg.engine.accelerator == "gpu" else torch.device('cpu') @@ -117,13 +120,6 @@ def configure_optimizers(self) -> Tuple[dict]: """ def training_step(self, batch, batch_idx): - """ Hook for `training_step`. - - Parameters - ---------- - batch : object containing the data output by the dataloader. 
- batch_idx : index of batch - """ # convert batch to dictionary form batch = dictionarify_batch(batch, self.cfg.data.dataset) @@ -142,49 +138,33 @@ def training_step(self, batch, batch_idx): # metrics self.metrics["Accuracy_train"].update(preds=outputs, target=batch["masks"]) - # the output is not used but returning None gives the following warning - # """lightning/pytorch/loops/optimization/automatic.py:129: - # UserWarning: `training_step` returned `None`. If this was - # on purpose, ignore this warning...""" - return {"loss": train_loss} - - - def training_step_end(self): - """ Hook for `training_step_end`. - """ if self.global_step % self.cfg.logger.log_every_n_steps == 0: metrics = {} - metrics["train_loss"] = torch.stack(self.training_step_outputs["train_loss"]).mean() self.training_step_outputs.clear() # free memory - # accuracy (batch only) metrics["train_acc"] = self.metrics["Accuracy_train"].compute() self.metrics["Accuracy_train"].reset() - # log learning rate #metrics['lr'] = self.lr_schedulers().get_last_lr()[0] - + # log training metrics metrics[self.step_label] = self.global_step self.logger.log_metrics(metrics) + # the output is not used but returning None gives the following warning + # """lightning/pytorch/loops/optimization/automatic.py:129: + # UserWarning: `training_step` returned `None`. If this was + # on purpose, ignore this warning...""" + return {"loss": train_loss} + def on_training_epoch_end(self): - """ Hook for `on_training_epoch_end`. - """ pass def validation_step(self, batch, batch_idx): - """ Hook for `validation_step`. - - Parameters - ---------- - batch : object containing the data output by the dataloader. 
- batch_idx : index of batch - """ # convert batch to dictionary form batch = dictionarify_batch(batch, self.cfg.data.dataset) @@ -194,13 +174,11 @@ def validation_step(self, batch, batch_idx): outputs = process_model_outputs(outputs, self.model) preds = torch.argmax(outputs, dim=1) - ''' for i in range(5): print(batch["inputs_paths"][i]) print(batch["masks_paths"][i]) - save_image(preds[0].unsqueeze(0).float(), fp=f"/Users/pme/Downloads/segm/mask_step{self.global_step}.jpeg") - i += 1 - ''' + torch.save(obj=preds[i], f=f"/Users/pme/Downloads/segm/mask_step{self.global_step}_i{i}.pt") + save_image(preds[i].unsqueeze(0).float(), fp=f"/Users/pme/Downloads/segm/mask_step{self.global_step}_i{i}.jpeg") # loss val_loss = self.loss(outputs, batch["masks"]) @@ -215,18 +193,7 @@ def validation_step(self, batch, batch_idx): #self.metrics["PrecisionRecallCurve_val"].update(preds = outputs, target = batch["masks"]) - def validation_step_end(self): - """ Hook for `validation_step_end`. - """ - pass - - def on_validation_epoch_end(self): - """ Hook for `on_validation_epoch_end`. - """ - - #TODO confirm on multi-gpu - #print('\nself.validation_step_outputs["val_loss"]', len(self.validation_step_outputs["val_loss"]), '\n') metrics = {} metrics["val_loss"] = torch.stack(self.validation_step_outputs["val_loss"]).mean() @@ -259,9 +226,9 @@ def on_validation_epoch_end(self): # log validation metrics metrics[self.step_label] = self.global_step - if not self.sanity_check: + if self.trainer.state.stage != RunningStage.SANITY_CHECKING: # `and self.global_step > 0` self.logger.log_metrics(metrics) - self.sanity_check = False + #self.sanity_check = False # The following is required for EarlyStopping and ModelCheckpoint callbacks to work properly. # Callbacks read from `self.log()`, not from `self.logger.log()`, so need to log there. @@ -276,13 +243,6 @@ def on_validation_epoch_end(self): def test_step(self, batch, batch_idx): - """ Hook for `test_step`. 
- - Parameters - ---------- - batch : object containing the data output by the dataloader. - batch_idx: index of batch. - """ # convert batch to dictionary form batch = dictionarify_batch(batch, self.cfg.data.dataset) @@ -303,17 +263,9 @@ def test_step(self, batch, batch_idx): self.metrics["Accuracy_test"].update(preds = preds, target = batch["masks"]) #self.metrics["ConfusionMatrix_test"].update(preds = preds, target = batch["masks"]) #self.metrics["PrecisionRecallCurve_test"].update(preds = outputs, target = batch["masks"]) - - - def test_step_end(self): - """ Hook for `test_step_end`. - """ - pass def on_test_epoch_end(self): - """ Hook for `on_test_epoch_end`. - """ metrics = {} metrics["test_loss"] = torch.stack(self.test_step_outputs["test_loss"]).mean() From 8a34bca2157bd14718c963b7ec8be3f0e65defa1 Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Sun, 5 Nov 2023 21:16:53 +0000 Subject: [PATCH 16/19] cleanup --- deeplightning/task/vision/segmentation.py | 39 ++--------------------- 1 file changed, 3 insertions(+), 36 deletions(-) diff --git a/deeplightning/task/vision/segmentation.py b/deeplightning/task/vision/segmentation.py index 29b8356..87eae9a 100755 --- a/deeplightning/task/vision/segmentation.py +++ b/deeplightning/task/vision/segmentation.py @@ -36,57 +36,24 @@ class SemanticSegmentationTask(BaseTask): """ def __init__(self, cfg: OmegaConf): super().__init__(cfg=cfg) + self.loss = init_obj_from_config(cfg.model.loss) self.model = init_obj_from_config(cfg.model.network) self.optimizer = init_obj_from_config(cfg.model.optimizer, self.model.parameters()) self.scheduler = init_obj_from_config(cfg.model.scheduler, self.optimizer) - # migration from `pytorch_lightning==1.5.10` to `lightning==2.0.0` self.training_step_outputs = {"train_loss": []} self.validation_step_outputs = {"val_loss": []} self.test_step_outputs = {"test_loss": []} - # PyTorch-Lightning performs a partial validation epoch to ensure that - # everything 
is correct. Use this to avoid logging metrics to W&B for that - #self.sanity_check = True - - # Initialise metrics to track during training - torch_device = torch.device("cuda") if cfg.engine.accelerator == "gpu" else torch.device('cpu') - - #self.metrics = init_metrics(cfg, device=torch_device) self.metrics = { "Accuracy_train": classification_accuracy(cfg), "Accuracy_val": classification_accuracy(cfg), "Accuracy_test": classification_accuracy(cfg), } - # Initialise label to track metrics against - self.step_label = "iteration" - - # Define hook functions - # to make the hooks bound to the class (so that they can access class attributes - # using `self.something`), the assignment must specify the class name as follows: - # `ClassName.fn = my_fn` rather than `self.fn = my_fn` - #SemanticSegmentationTask._training_step = __HooksRegistry__[cfg.task]["training_step"] - #SemanticSegmentationTask._training_step_end = __HooksRegistry__[cfg.task]["training_step_end"] - #SemanticSegmentationTask._on_training_epoch_end = __HooksRegistry__[cfg.task]["on_training_epoch_end"] - #SemanticSegmentationTask._validation_step = __HooksRegistry__[cfg.task]["validation_step"] - #SemanticSegmentationTask._validation_step_end = __HooksRegistry__[cfg.task]["validation_step_end"] - #SemanticSegmentationTask._on_validation_epoch_end = __HooksRegistry__[cfg.task]["on_validation_epoch_end"] - #SemanticSegmentationTask._test_step = __HooksRegistry__[cfg.task]["test_step"] - #SemanticSegmentationTask._test_step_end = __HooksRegistry__[cfg.task]["test_step_end"] - #SemanticSegmentationTask._on_test_epoch_end = __HooksRegistry__[cfg.task]["on_test_epoch_end"] - - # Aggregation utilities - #self.gather_on_step = gather_on_step - #self.gather_on_epoch = gather_on_epoch - - # PyTorch-Lightning's model summary does not give the - # correct number of trainable parameters; see - # https://github.com/PyTorchLightning/pytorch-lightning/issues/12130 - self.trainable_params = sum(p.numel() for p in 
self.model.parameters() if p.requires_grad) - info_message("Trainable parameters: {:,d}".format(self.trainable_params)) - + self.on_task_init_end() + def forward(self, x: Tensor) -> Tensor: """ Model forward pass. From c9d103c8c9ce3e191a8b062affead4c96cdea2df Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Mon, 6 Nov 2023 22:21:30 +0000 Subject: [PATCH 17/19] . --- configs/SkinLesionSegmentation.yaml | 4 ++-- deeplightning/task/vision/segmentation.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/SkinLesionSegmentation.yaml b/configs/SkinLesionSegmentation.yaml index c5e940c..f9177b1 100755 --- a/configs/SkinLesionSegmentation.yaml +++ b/configs/SkinLesionSegmentation.yaml @@ -18,7 +18,7 @@ data: normalize: # use `deeplightning.utils.data.compute_dataset_mean_and_stdev()` mean: [0., 0., 0.] std: [1., 1., 1.] - resize: [32, 32] + resize: [128, 128] test_transforms: normalize: mean: [0., 0., 0.] @@ -58,7 +58,7 @@ metrics: test: default train: - num_epochs: 2 + num_epochs: 20 val_every_n_epoch: 1 grad_accum_from_epoch: 0 grad_accum_every_n_batches: 1 diff --git a/deeplightning/task/vision/segmentation.py b/deeplightning/task/vision/segmentation.py index 87eae9a..dd5a5bc 100755 --- a/deeplightning/task/vision/segmentation.py +++ b/deeplightning/task/vision/segmentation.py @@ -145,7 +145,7 @@ def validation_step(self, batch, batch_idx): print(batch["inputs_paths"][i]) print(batch["masks_paths"][i]) torch.save(obj=preds[i], f=f"/Users/pme/Downloads/segm/mask_step{self.global_step}_i{i}.pt") - save_image(preds[i].unsqueeze(0).float(), fp=f"/Users/pme/Downloads/segm/mask_step{self.global_step}_i{i}.jpeg") + save_image(preds[i].unsqueeze(0).float(), fp=f"/Users/pme/Downloads/segm/{batch['masks_paths'][i]}_pred_step{self.global_step}.png") # loss val_loss = self.loss(outputs, batch["masks"]) From 3d3a4b42fd487819ebee49ca507ed8d66dbca48c Mon Sep 17 00:00:00 2001 From: pme0 
<12113751+pme0@users.noreply.github.com> Date: Sun, 12 Nov 2023 21:01:20 +0000 Subject: [PATCH 18/19] rename sound -> audio --- deeplightning/data/dataloaders/{sound => audio}/__init__.py | 0 deeplightning/data/dataloaders/{sound => audio}/fsd.py | 0 deeplightning/task/{sound => audio}/__init__.py | 0 deeplightning/task/{sound => audio}/classification.py | 0 deeplightning/viz/{sound => audio}/__init__.py | 0 deeplightning/viz/{sound => audio}/spectrum.py | 0 deeplightning/viz/{sound => audio}/waveform.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename deeplightning/data/dataloaders/{sound => audio}/__init__.py (100%) rename deeplightning/data/dataloaders/{sound => audio}/fsd.py (100%) rename deeplightning/task/{sound => audio}/__init__.py (100%) rename deeplightning/task/{sound => audio}/classification.py (100%) rename deeplightning/viz/{sound => audio}/__init__.py (100%) rename deeplightning/viz/{sound => audio}/spectrum.py (100%) rename deeplightning/viz/{sound => audio}/waveform.py (100%) diff --git a/deeplightning/data/dataloaders/sound/__init__.py b/deeplightning/data/dataloaders/audio/__init__.py similarity index 100% rename from deeplightning/data/dataloaders/sound/__init__.py rename to deeplightning/data/dataloaders/audio/__init__.py diff --git a/deeplightning/data/dataloaders/sound/fsd.py b/deeplightning/data/dataloaders/audio/fsd.py similarity index 100% rename from deeplightning/data/dataloaders/sound/fsd.py rename to deeplightning/data/dataloaders/audio/fsd.py diff --git a/deeplightning/task/sound/__init__.py b/deeplightning/task/audio/__init__.py similarity index 100% rename from deeplightning/task/sound/__init__.py rename to deeplightning/task/audio/__init__.py diff --git a/deeplightning/task/sound/classification.py b/deeplightning/task/audio/classification.py similarity index 100% rename from deeplightning/task/sound/classification.py rename to deeplightning/task/audio/classification.py diff --git a/deeplightning/viz/sound/__init__.py 
b/deeplightning/viz/audio/__init__.py similarity index 100% rename from deeplightning/viz/sound/__init__.py rename to deeplightning/viz/audio/__init__.py diff --git a/deeplightning/viz/sound/spectrum.py b/deeplightning/viz/audio/spectrum.py similarity index 100% rename from deeplightning/viz/sound/spectrum.py rename to deeplightning/viz/audio/spectrum.py diff --git a/deeplightning/viz/sound/waveform.py b/deeplightning/viz/audio/waveform.py similarity index 100% rename from deeplightning/viz/sound/waveform.py rename to deeplightning/viz/audio/waveform.py From 87f87d6924a537ef035adf294a2f261c50582fd1 Mon Sep 17 00:00:00 2001 From: pme0 <12113751+pme0@users.noreply.github.com> Date: Sun, 12 Nov 2023 21:08:00 +0000 Subject: [PATCH 19/19] rename sound -> audio --- examples/audio/audio_classification_cnn/generate_media.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/audio/audio_classification_cnn/generate_media.py b/examples/audio/audio_classification_cnn/generate_media.py index b4e2bfb..18934d4 100644 --- a/examples/audio/audio_classification_cnn/generate_media.py +++ b/examples/audio/audio_classification_cnn/generate_media.py @@ -13,8 +13,8 @@ sys.path.append(parent2) print(f"Added to system path: '{parent2}'") -from deeplightning.viz.sound.waveform import waveplot -from deeplightning.viz.sound.spectrum import spectrogram +from deeplightning.viz.audio.waveform import waveplot +from deeplightning.viz.audio.spectrum import spectrogram def parse_args():