diff --git a/benchmarks/benchmark_bitpacking.py b/benchmarks/benchmark_bitpacking.py
index 616567165c..9e3d57d508 100644
--- a/benchmarks/benchmark_bitpacking.py
+++ b/benchmarks/benchmark_bitpacking.py
@@ -21,11 +21,11 @@ def benchmark(function, args, num_runs):
 def test_vs_existing():
     def new_(scale):
-        fake_tensor = torch.randint(2**8-1, (1, scale,scale), dtype=torch.uint8).cuda()
+        fake_tensor = torch.randint(2**8, (1, scale,scale), dtype=torch.uint8).cuda()
         packed = pack(fake_tensor, 4, dim=1)
         unpacked = unpack(packed, 4, dim=1)
     def old_(scale):
-        fake_tensor = torch.randint(2**8-1, (1, scale,scale), dtype=torch.uint8).cuda()
+        fake_tensor = torch.randint(2**8, (1, scale,scale), dtype=torch.uint8).cuda()
         packed = pack_uint4(fake_tensor)
         unpacked = unpack_uint4(packed)
@@ -55,9 +55,9 @@ class W4A16_symmetric_weight_only(torch.nn.Module):
     def __init__(self, scale):
         super().__init__()
         assert scale % 4 == 0
-        self.l1 = torch.randint(2**8-1,(scale, scale), dtype=torch.uint8).cuda()
+        self.l1 = torch.randint(2**8,(scale, scale), dtype=torch.uint8).cuda()
         self.s1 = torch.tensor((scale),dtype=torch.float16).cuda()
-        self.l2 = torch.randint(2**8-1,(scale//2, scale//4), dtype=torch.uint8).cuda()
+        self.l2 = torch.randint(2**8,(scale//2, scale//4), dtype=torch.uint8).cuda()
         self.s2 = torch.tensor((scale//4),dtype=torch.float16).cuda()
@@ -79,7 +79,7 @@ def forward(self, x):
     b = torch.compile(b, fullgraph=True)
     test_input = torch.randn(scale*2, dtype=torch.float16).cuda()
-    forward_args = [test_input]
+    forward_args = [test_input]
     b.forward(test_input)
     print("scale: ", scale)
     print("fp16 time: ", benchmark(a.forward, forward_args, 100))
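
Note: torch.randint treats its high argument as exclusive (values are drawn from [0, high)), so the previous bound of 2**8-1 could never produce 255, while 2**8 covers the full uint8 range. A minimal illustration of that behavior, using CPU tensors and illustrative variable names only:

    import torch

    # high is exclusive, so values land in [0, high)
    old_style = torch.randint(2**8 - 1, (10_000,), dtype=torch.uint8)  # never yields 255
    new_style = torch.randint(2**8, (10_000,), dtype=torch.uint8)      # can yield 255
    print(int(old_style.max()), int(new_style.max()))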