max_steps has no effect in combination with gradient accumulation #4193

Closed
awaelchli opened this issue Oct 16, 2020 · 1 comment · Fixed by #4278
@awaelchli
Contributor

🐛 Bug

import os
import torch
from torch.utils.data import Dataset
from pytorch_lightning import Trainer, LightningModule


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class BoringModel(LightningModule):

    def __init__(self):
        """
        Testing PL Module

        Use as follows:
        - subclass
        - modify the behavior for what you want

        class TestModel(BoringModel):
            def training_step(...):
                # do your own thing

        or:

        model = BoringModel()
        model.training_epoch_end = None

        """
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def loss(self, batch, prediction):
        # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls
        return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction))

    def step(self, x):
        x = self.layer(x)
        out = torch.nn.functional.mse_loss(x, torch.ones_like(x))
        return out

    def training_step(self, batch, batch_idx):
        output = self.layer(batch)
        loss = self.loss(batch, output)
        return {"loss": loss}

    def training_step_end(self, training_step_outputs):
        return training_step_outputs

    def training_epoch_end(self, outputs) -> None:
        torch.stack([x["loss"] for x in outputs]).mean()

    def validation_step(self, batch, batch_idx):
        output = self.layer(batch)
        loss = self.loss(batch, output)
        return {"x": loss}

    def validation_epoch_end(self, outputs) -> None:
        torch.stack([x['x'] for x in outputs]).mean()

    def test_step(self, batch, batch_idx):
        output = self.layer(batch)
        loss = self.loss(batch, output)
        return {"y": loss}

    def test_epoch_end(self, outputs) -> None:
        torch.stack([x["y"] for x in outputs]).mean()

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
        return [optimizer], [lr_scheduler]


def run_test():
    class TestModel(BoringModel):

        def on_train_epoch_start(self) -> None:
            print('override any method to prove your bug')

    # fake data
    train_data = torch.utils.data.DataLoader(RandomDataset(32, 64))
    val_data = torch.utils.data.DataLoader(RandomDataset(32, 64))
    test_data = torch.utils.data.DataLoader(RandomDataset(32, 64))

    # model
    model = TestModel()
    trainer = Trainer(
        default_root_dir=os.getcwd(),
        # -----------------------------------------------------------------------------
        max_steps=10,  # HERE HAS NO EFFECT IN COMBINATION WITH accumulate_grad_batches
        # -----------------------------------------------------------------------------
        accumulate_grad_batches=2,
        weights_summary=None,
    )
    trainer.fit(model, train_data, val_data)
    trainer.test(test_dataloaders=test_data)


if __name__ == '__main__':
    run_test()

With accumulate_grad_batches=2, this training runs for the full default of 1000 epochs instead of stopping after max_steps=10 optimizer steps (which would be only 20 batches, i.e. less than one epoch of this 64-sample dataset).

Expected behavior

Training stops after max_steps optimizer steps, regardless of the accumulate_grad_batches setting.
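
A quick sanity check (not part of the original report, just a sketch against the repro above): after trainer.fit returns, trainer.global_step counts optimizer steps, so with the settings above it should read 10 and training should not even finish the first epoch.

    trainer.fit(model, train_data, val_data)
    print(trainer.global_step)    # expected: 10 optimizer steps (== max_steps)
    print(trainer.current_epoch)  # expected: 0, since 10 steps * 2 accumulated batches = 20 < 64 batches per epoch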

Additional context

The tests marked with a TODO can be found in #4190.
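
For context, a hand-rolled sketch (not the Lightning source) of the intended semantics, reusing BoringModel and RandomDataset from the repro above: the step counter only advances when the optimizer actually steps, and the stop condition compares that counter, not the batch index, against max_steps.

    max_steps, max_epochs, accumulate_grad_batches = 10, 1000, 2
    model = BoringModel()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    train_data = torch.utils.data.DataLoader(RandomDataset(32, 64))

    global_step = 0
    for epoch in range(max_epochs):
        for batch_idx, batch in enumerate(train_data):
            loss = model.training_step(batch, batch_idx)["loss"]
            (loss / accumulate_grad_batches).backward()
            if (batch_idx + 1) % accumulate_grad_batches == 0:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1  # advances only on optimizer steps, not on every batch
            if global_step >= max_steps:
                break
        if global_step >= max_steps:
            break
    print(global_step)  # 10 -> stops after max_steps optimizer steps (20 batches here)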

@awaelchli added the bug and help wanted labels on Oct 16, 2020
@awaelchli self-assigned this on Oct 17, 2020
@edenlightning added this to the 1.0.3 milestone on Oct 19, 2020
@edenlightning assigned SeanNaren and unassigned awaelchli on Oct 20, 2020
@edenlightning
Contributor

@SeanNaren please take a look!
