Skip to content

Commit

Permalink
Merge pull request #66 from hmorimitsu/rapidflow
Browse files Browse the repository at this point in the history
Adapt RAPIDFlow code to TensorRT and add simple test script
  • Loading branch information
hmorimitsu authored May 28, 2024
2 parents f42066c + 2831fc4 commit f5c7d6b
Show file tree
Hide file tree
Showing 7 changed files with 280 additions and 16 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,4 @@ jobs:
mv ptlflow ptlflow_tmp
- name: Test with pytest
run: |
python -m pytest
python -m pytest tests/
2 changes: 1 addition & 1 deletion .github/workflows/lightning.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,4 @@ jobs:
- name: Test with pytest
run: |
pip install pytest
python -m pytest
python -m pytest tests/
2 changes: 1 addition & 1 deletion .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@ jobs:
- name: Test with pytest
run: |
pip install pytest
python -m pytest
python -m pytest tests/
2 changes: 1 addition & 1 deletion .github/workflows/pytorch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@ jobs:
- name: Test with pytest
run: |
pip install pytest
python -m pytest
python -m pytest tests/
14 changes: 11 additions & 3 deletions ptlflow/models/rapidflow/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,19 @@ You can also provide your own images to test by providing an additional argument
python onnx_infer.py rapidflow_it12.onnx --image_paths /path/to/first/image /path/to/second/image
```

### ONNX example limitations
## Compiling model to TensorRT

Directly converting the model to ONNX as shown in this example will work, but it is not optimal.
The script [tensorrt_test.py](tensorrt_test.py) provides a simple example of how to compile RAPIDFlow models to TensorRT.
Run it by typing:
```bash
python tensorrt_test.py rapidflow_it12 --checkpoint things
```

### ONNX and TensorRT example limitations

Directly converting the model to ONNX and TensorRT as shown in this example will work, but it is not optimal.
To obtain the best conversion, it would be necessary to rewrite some parts of the code to remove conditions and operations that may change according to the input size.
Also, ONNX conversion only supports `--corr_mode allpairs`, which is not suitable for large images.
Also, these conversions only support `--corr_mode allpairs`, which is not suitable for large images.

## Code license

Expand Down
75 changes: 66 additions & 9 deletions ptlflow/models/rapidflow/rapidflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import torch.nn.functional as F

from ptlflow.utils.utils import forward_interpolate_batch
from .pwc_modules import rescale_flow, upsample2d_as
from .pwc_modules import rescale_flow
from .update import UpdateBlock
from .corr import get_corr_block
from .local_timm.norm import LayerNorm2d
Expand Down Expand Up @@ -353,8 +353,11 @@ def forward(self, inputs):
and "prev_flows" in inputs
and inputs["prev_flows"] is not None
):
flow = upsample2d_as(
inputs["prev_flows"][:, 0], pass_pyramid1[0], mode="bilinear"
flow = F.interpolate(
inputs["prev_flows"][:, 0],
[pass_pyramid1[0].shape[-2], pass_pyramid1[0].shape[-1]],
mode="bilinear",
align_corners=True,
)
flow = rescale_flow(flow, width_im, height_im, to_local=True)
flow = forward_interpolate_batch(flow)
Expand Down Expand Up @@ -385,7 +388,12 @@ def forward(self, inputs):
if net is None:
net = torch.tanh(net_tmp)
else:
net = upsample2d_as(net, x1, mode="bilinear")
net = F.interpolate(
net,
[x1.shape[-2], x1.shape[-1]],
mode="bilinear",
align_corners=True,
)

net_skip = torch.tanh(net_tmp)
gate = torch.sigmoid(
Expand All @@ -395,7 +403,12 @@ def forward(self, inputs):

if l > 0:
flow = rescale_flow(flow, x1.shape[-1], x1.shape[-2], to_local=False)
flow = upsample2d_as(flow, x1, mode="bilinear")
flow = F.interpolate(
flow,
[x1.shape[-2], x1.shape[-1]],
mode="bilinear",
align_corners=True,
)

for k in range(iters_per_level[l]):
flow = flow.detach()
Expand All @@ -414,16 +427,60 @@ def forward(self, inputs):
out_flow = rescale_flow(flow, width_im, height_im, to_local=False)
if self.training:
if mask is not None and l == (output_level - start_level):
out_flow = self.upsample_flow(out_flow, mask, pred_stride)
if self.args.simple_io:
# Just copied the code from self.upsample_flow to here.
# For some reason, TensorRT backend does not compile when calling the function
N, _, H, W = out_flow.shape
mask = mask.view(N, 1, 9, pred_stride, pred_stride, H, W)
mask = torch.softmax(mask, dim=2)

up_flow = F.unfold(flow, [3, 3], padding=1)
up_flow = up_flow.view(N, 2, 9, 1, 1, H, W)

up_flow = torch.sum(mask * up_flow, dim=2)
up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)
up_flow = up_flow.reshape(
N, 2, pred_stride * H, pred_stride * W
)
out_flow = up_flow
else:
out_flow = self.upsample_flow(out_flow, mask, pred_stride)
else:
out_flow = upsample2d_as(out_flow, x1_raw, mode="bilinear")
out_flow = F.interpolate(
out_flow,
[x1_raw.shape[-2], x1_raw.shape[-1]],
mode="bilinear",
align_corners=True,
)
elif l == (output_level - start_level) and k == (
iters_per_level[l] - 1
):
if mask is not None:
out_flow = self.upsample_flow(out_flow, mask, pred_stride)
if self.args.simple_io:
# Just copied the code from self.upsample_flow to here.
# For some reason, TensorRT backend does not compile when calling the function
N, _, H, W = out_flow.shape
mask = mask.view(N, 1, 9, pred_stride, pred_stride, H, W)
mask = torch.softmax(mask, dim=2)

up_flow = F.unfold(flow, [3, 3], padding=1)
up_flow = up_flow.view(N, 2, 9, 1, 1, H, W)

up_flow = torch.sum(mask * up_flow, dim=2)
up_flow = up_flow.permute(0, 1, 4, 2, 5, 3)
up_flow = up_flow.reshape(
N, 2, pred_stride * H, pred_stride * W
)
out_flow = up_flow
else:
out_flow = self.upsample_flow(out_flow, mask, pred_stride)
else:
out_flow = upsample2d_as(out_flow, x1_raw, mode="bilinear")
out_flow = F.interpolate(
out_flow,
[x1_raw.shape[-2], x1_raw.shape[-1]],
mode="bilinear",
align_corners=True,
)
out_flow = self.postprocess_predictions(
out_flow, image_resizer, is_flow=True
)
Expand Down
199 changes: 199 additions & 0 deletions ptlflow/models/rapidflow/tensorrt_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
# TensorRT conversion code comes from the tutorial:
# https://pytorch.org/TensorRT/tutorials/_rendered_examples/dynamo/torch_compile_resnet_example.html


import sys
from argparse import ArgumentParser
from pathlib import Path
import time

import cv2 as cv
import numpy as np
import torch
import torch_tensorrt

this_dir = Path(__file__).parent.resolve()
sys.path.insert(0, str(this_dir.parent.parent.parent))

from ptlflow import get_model, load_checkpoint
from ptlflow.models.rapidflow.rapidflow import RAPIDFlow
from ptlflow.utils import flow_utils


def _init_parser() -> ArgumentParser:
    """Build the command-line parser for the script-level options.

    Returns:
        An ``ArgumentParser`` holding the positional ``model`` argument and
        the ``--checkpoint``, ``--image_paths``, ``--output_path`` and
        ``--input_size`` options.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "model",
        type=str,
        choices=(
            "rapidflow",
            "rapidflow_it1",
            "rapidflow_it2",
            "rapidflow_it3",
            "rapidflow_it6",
            "rapidflow_it12",
        ),
        help="Name of the model to use.",
    )
    parser.add_argument(
        "--checkpoint",
        type=str,
        default=None,
        # Plain braces: "\{" is an invalid escape sequence in a normal string
        # and would also show up as a literal backslash in the --help output.
        help=(
            "Path to the checkpoint to be loaded. It can also be one of the "
            "following names: {chairs, things, sintel, kitti}, in which case "
            "the respective pretrained checkpoint will be downloaded."
        ),
    )
    parser.add_argument(
        "--image_paths",
        type=str,
        nargs=2,
        # Defaults point to the sample pair stored next to this script.
        default=(
            str(this_dir / "image_samples" / "000000_10.png"),
            str(this_dir / "image_samples" / "000000_11.png"),
        ),
        help="Path to two images to estimate the optical flow with the TensorRT model.",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default=".",
        help="Path to the directory where the predictions will be saved.",
    )
    parser.add_argument(
        "--input_size",
        type=int,
        nargs=2,
        # NOTE(review): load_images reverses this pair before handing it to
        # cv.resize, so it is presumably given as (height, width) - confirm.
        default=(384, 1280),
        help="Size of the input image.",
    )
    return parser


def _timed_runs(fn, inputs, num_tries):
    """Call ``fn(*inputs)`` ``num_tries`` times and accumulate the elapsed time.

    The first call is treated as a warm-up and is not counted.

    Args:
        fn: Callable to be timed.
        inputs: Sequence of positional arguments passed to ``fn``.
        num_tries: Total number of calls, including the warm-up call.

    Returns:
        A tuple ``(total_seconds, last_output)`` where ``total_seconds`` is the
        summed wall-clock time of all calls but the first and ``last_output``
        is the value returned by the final call.
    """
    total_time = 0.0
    output = None
    for i in range(num_tries):
        # Drop the previous result before running again so that two outputs
        # are never kept alive at the same time.
        output = None
        # Synchronize on both sides so the interval covers the queued GPU
        # work and not only the time needed to launch it.
        torch.cuda.synchronize()
        start = time.perf_counter()
        output = fn(*inputs)
        torch.cuda.synchronize()
        end = time.perf_counter()
        if i > 0:
            total_time += end - start
    return total_time, output


def compile_engine_and_infer(args):
    """Compile the model with the Torch-TensorRT backend and benchmark it.

    The model and the image pair are moved to CUDA in half precision. The
    original model and the compiled one are each run ``num_tries`` times and
    their average run times (excluding the first run) are printed. The script
    then tries to save and reload the compiled model as ``<args.model>.tc`` and
    writes the last prediction to ``flow_pred.flo`` and ``flow_pred_viz.png``
    inside ``args.output_path``.

    Args:
        args: Parsed command-line namespace. This function reads ``model``,
            ``image_paths`` and ``output_path`` directly; the rest is
            forwarded to ``load_model``.
    """
    # Initialize model with half precision and sample inputs
    model = load_model(args).half().eval().to("cuda")
    images = [torch.from_numpy(load_images(args.image_paths)).half().to("cuda")]

    num_tries = 11
    # Only the accumulated time is kept; the output of the original model is
    # released right away.
    total_time_orig = _timed_runs(model, images, num_tries)[0]

    # Enabled precision for TensorRT optimization
    enabled_precisions = {torch.half}

    # Whether to print verbose logs
    debug = True

    # Workspace size for TensorRT (20 GiB)
    workspace_size = 20 << 30

    # Minimum number of operators per TRT engine block
    # (Lower value allows more graph segmentation)
    min_block_size = 7

    # Operations to Run in Torch, regardless of converter support.
    # An empty set (not an empty dict) matches the collection-of-ops type
    # expected by this option.
    torch_executed_ops = set()

    # Build and compile the model with torch.compile, using Torch-TensorRT backend
    compiled_model = torch_tensorrt.compile(
        model,
        ir="torch_compile",
        inputs=images,
        enabled_precisions=enabled_precisions,
        debug=debug,
        workspace_size=workspace_size,
        min_block_size=min_block_size,
        torch_executed_ops=torch_executed_ops,
    )

    total_time_optimized, flow_pred = _timed_runs(compiled_model, images, num_tries)

    # Saving is best-effort: a failure here must not prevent the timing report
    # and the prediction outputs below from being produced.
    try:
        torch_tensorrt.save(compiled_model, f"{args.model}.tc", inputs=images)
        print(f"Saving compiled model to {args.model}.tc")
        compiled_model = torch_tensorrt.load(f"{args.model}.tc")
        print(f"Loading compiled model from {args.model}.tc")
    except Exception as e:
        print("WARNING: The compiled model was not saved due to the error:")
        print(e)

    print(f"Model: {args.model}. Average time of {num_tries - 1} runs:")
    print(f"Time (original): {(1000 * total_time_orig / (num_tries - 1)):.2f} ms.")
    print(f"Time (compiled): {(1000 * total_time_optimized / (num_tries - 1)):.2f} ms.")

    # NOTE(review): assumes flow_pred[0] is laid out as (channels, H, W), so
    # the permutation yields an (H, W, channels) array - confirm.
    flow_pred_npy = flow_pred[0].permute(1, 2, 0).detach().cpu().numpy()

    output_dir = Path(args.output_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    flo_output_path = output_dir / "flow_pred.flo"
    flow_utils.flow_write(flo_output_path, flow_pred_npy)
    print(f"Saved flow prediction to: {flo_output_path}")

    viz_output_path = output_dir / "flow_pred_viz.png"
    flow_viz = flow_utils.flow_to_rgb(flow_pred_npy)
    # OpenCV writes images in BGR channel order.
    cv.imwrite(str(viz_output_path), cv.cvtColor(flow_viz, cv.COLOR_RGB2BGR))
    print(f"Saved flow prediction visualization to: {viz_output_path}")

    # Finally, we use Torch utilities to clean up the workspace
    torch._dynamo.reset()


def load_images(image_paths, input_size=None):
    """Read, resize and stack images into a single float32 array.

    Args:
        image_paths: Iterable with the paths of the images to load.
        input_size: Target size as a pair of ints. The pair is reversed before
            being passed to ``cv.resize``. When ``None``, the value is taken
            from the module-level ``args.input_size``, which is only defined
            when this file runs as a script.

    Returns:
        A float32 array with values scaled by 1/255. The stacked images are
        transposed to channels-first and a leading axis of size one is added.

    Raises:
        FileNotFoundError: If any of the images cannot be read.
    """
    if input_size is None:
        # Backward-compatible fallback to the namespace parsed in __main__.
        input_size = args.input_size
    # cv.resize receives the target size in reversed order.
    dsize = tuple(input_size[::-1])

    images = []
    for path in image_paths:
        image = cv.imread(path)
        if image is None:
            # cv.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {path}")
        images.append(cv.resize(image, dsize))

    images = np.stack(images)
    images = images.transpose(0, 3, 1, 2)[None]
    images = images.astype(np.float32) / 255.0
    return images


def load_model(args):
    """Create the requested model and load the fused checkpoint weights into it.

    Args:
        args: Parsed command-line namespace. ``args.model`` selects the model
            and ``args.checkpoint`` is handed to ``load_checkpoint``.

    Returns:
        The model instance after ``load_state_dict`` has been applied with
        ``strict=True``.
    """
    network = get_model(args.model, args=args)
    checkpoint = load_checkpoint(args.checkpoint, RAPIDFlow, "rapidflow")
    # The checkpoint keeps separate "weight_h"/"weight_v" entries; they are
    # merged into single "weight" entries before strict loading.
    fused_weights = fuse_checkpoint_next1d_layers(checkpoint["state_dict"])
    network.load_state_dict(fused_weights, strict=True)
    return network


def fuse_checkpoint_next1d_layers(state_dict):
    """Merge paired ``weight_h``/``weight_v`` entries into single ``weight`` entries.

    Every key ending in ``weight_h`` or ``weight_v`` is grouped by its prefix
    (the key without the trailing ``.weight_h``/``.weight_v``). Each pair is
    combined with ``einsum("cijk,cimj->cimk", h, v)`` and stored under
    ``"<prefix>.weight"``. All other entries are copied unchanged.

    Args:
        state_dict: Mapping from parameter names to tensors.

    Returns:
        A new dict with the untouched entries first, followed by the fused
        weights.

    Raises:
        KeyError: If a prefix has only one of the two orientations.
    """
    # Number of trailing characters to strip, including the separating dot.
    suffix_len = len(".weight_h")

    untouched = {}
    pairs_by_prefix = {}
    for key, tensor in state_dict.items():
        if not key.endswith(("weight_h", "weight_v")):
            untouched[key] = tensor
            continue
        # The last character ("h" or "v") identifies the orientation.
        pairs_by_prefix.setdefault(key[:-suffix_len], {})[key[-1]] = tensor

    fused = {
        f"{prefix}.weight": torch.einsum(
            "cijk,cimj->cimk", pair["h"], pair["v"]
        )
        for prefix, pair in pairs_by_prefix.items()
    }
    return {**untouched, **fused}


if __name__ == "__main__":
    # Script options first, then the options declared by the model itself.
    parser = _init_parser()
    parser = RAPIDFlow.add_model_specific_args(parser)
    # `args` stays a module-level name on purpose: load_images reads it.
    args = parser.parse_args()
    # Force the settings used by this TensorRT example, overriding whatever
    # was given on the command line.
    vars(args).update(
        corr_mode="allpairs",
        fuse_next1d_weights=True,
        simple_io=True,
    )

    compile_engine_and_infer(args)

0 comments on commit f5c7d6b

Please sign in to comment.