add profile test

intel · Jan 23, 2025 · 9a25078 · 9a25078
1 parent c04452f
commit 9a25078
Show file tree

Hide file tree

Showing 7 changed files with 1,646 additions and 0 deletions.
diff --git a/.github/scripts/profile/correlation_id_mixed.py b/.github/scripts/profile/correlation_id_mixed.py
@@ -0,0 +1,12 @@
+import torch
+
+from torch.profiler import profile, record_function, ProfilerActivity
+
+# Profile three elementwise adds on two XPU tensors and print the per-op summary.
+input1 = torch.randn(3, 3, device='xpu')
+input2 = torch.randn(3, 3, device='xpu')
+traced_activities = [ProfilerActivity.CPU, ProfilerActivity.XPU]
+with profile(activities=traced_activities) as prof:
+    output1, output2 = input1 + 1.0, input2 + 2.0
+    output = output1 + output2
+print(prof.key_averages().table(sort_by="xpu_time_total"))
diff --git a/.github/scripts/profile/profile_partial_runtime_ops.py b/.github/scripts/profile/profile_partial_runtime_ops.py
@@ -0,0 +1,16 @@
+import torch
+
+def compute(input1, input2):
+    # Copy input1 to the XPU device and add 1.0 there; input2 is accepted but unused.
+    return input1.to(device='xpu') + 1.0
+
+# Both tensors are created on the CPU; compute() performs the copy to XPU itself.
+input1 = torch.randn(3, 3, device='cpu')
+input2 = torch.randn(3, 3, device='cpu')
+# Warm-up: one call outside the profiler before the measured iteration.
+output = compute(input1, input2)
+activities = [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.XPU]
+for _ in range(1):  # single profiled iteration; raise the count to repeat the measurement
+    with torch.profiler.profile(activities=activities) as p:
+        output = compute(input1, input2)
+    print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=-1))
diff --git a/.github/scripts/profile/reproducer.missing.gpu.kernel.time.py b/.github/scripts/profile/reproducer.missing.gpu.kernel.time.py
@@ -0,0 +1,60 @@
+import torch
+
+from torch.profiler import profile, record_function, ProfilerActivity
+
+def maxUnpool2d(shape, dtype, channels_last, backward):
+    """Pool a random tensor on CPU, then run MaxUnpool2d (and optionally backward) on XPU.
+
+    shape is read as (N, C, H, W); dtype is applied to the XPU value and gradient tensors.
+    channels_last adds a channels_last conversion before each device/dtype conversion.
+    """
+    N, C, H, W = (int(shape[axis]) for axis in range(4))
+    full_size = [N, C, H, W]
+    kernel_size = 2
+    pool = torch.nn.MaxPool2d(kernel_size, return_indices=True)
+    unpool = torch.nn.MaxUnpool2d(kernel_size)
+
+    def as_layout(tensor):
+        # Convert to channels_last only when the caller asked for it.
+        return tensor.to(memory_format=torch.channels_last) if channels_last else tensor
+
+    # The pooling step stays on the CPU in float32; it only produces values and indices.
+    cpu_input = as_layout(torch.randn(full_size)).to(device="cpu", dtype=torch.float32)
+    output, indices = pool(cpu_input)
+
+    # Move the pooled values and their indices to XPU for the unpool call.
+    x_dpcpp = as_layout(output).to(device="xpu", dtype=dtype)
+    indices_dpcpp = as_layout(indices).to(device="xpu", dtype=torch.int64)
+
+    if backward:
+        # Random upstream gradient with the full (N, C, H, W) size.
+        x_dpcpp.requires_grad_(True)
+        grad_dpcpp = as_layout(torch.randn(full_size)).to(device="xpu", dtype=dtype)
+
+    # Unpool back to the pre-pooling size.
+    y_dpcpp = unpool(x_dpcpp, indices_dpcpp, output_size=torch.Size(full_size)).to("xpu")
+
+    if backward:
+        y_dpcpp.backward(grad_dpcpp)
+
+if __name__ == "__main__":
+    # Reduced to one configuration; the wider sweep was layouts [False, True] and
+    # shapes [[4,64,128,128],[4,65,128,128],[8,128,128,128]].
+    dtype = torch.float32
+    backward = True
+    layouts = [False]
+    shapes = [[4, 64, 128, 128]]
+    activities = [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.XPU]
+    for channels_last in layouts:
+        for shape in shapes:
+            print("======================================")
+            print("channels_last is %s, backward is %s, shape is %s" % (channels_last, backward, shape))
+
+            # Three warm-up runs outside the profiler before the measured run.
+            for _ in range(3):
+                maxUnpool2d(shape, dtype, channels_last, backward=backward)
+
+            # Measured run: exactly one call inside the profiled region.
+            with torch.profiler.profile(activities=activities) as prof:
+                maxUnpool2d(shape, dtype, channels_last, backward=backward)
+            print(prof.key_averages().table(sort_by="xpu_time_total"))