diff --git a/README.md b/README.md index 85071d5..3208887 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@
-
+
diff --git a/assets/dancer1.gif b/assets/dancer1.gif deleted file mode 100644 index fddf720..0000000 Binary files a/assets/dancer1.gif and /dev/null differ diff --git a/assets/dancer2.gif b/assets/dancer2.gif deleted file mode 100644 index 6b67a5c..0000000 Binary files a/assets/dancer2.gif and /dev/null differ diff --git a/assets/dancer3.gif b/assets/dancer3.gif deleted file mode 100644 index 124e4ff..0000000 Binary files a/assets/dancer3.gif and /dev/null differ diff --git a/assets/hybrik_dance1.gif b/assets/hybrik_dance1.gif new file mode 100644 index 0000000..a6f889c Binary files /dev/null and b/assets/hybrik_dance1.gif differ diff --git a/assets/hybrik_dance2.gif b/assets/hybrik_dance2.gif new file mode 100644 index 0000000..884b19e Binary files /dev/null and b/assets/hybrik_dance2.gif differ diff --git a/assets/hybrik_dance3.gif b/assets/hybrik_dance3.gif new file mode 100644 index 0000000..65dc4de Binary files /dev/null and b/assets/hybrik_dance3.gif differ diff --git a/configs/256x192_adam_lr1e-3-hrw48_cam_2x_w_pw3d_3dhp.yaml b/configs/256x192_adam_lr1e-3-hrw48_cam_2x_w_pw3d_3dhp.yaml index 0d82d33..51946a0 100644 --- a/configs/256x192_adam_lr1e-3-hrw48_cam_2x_w_pw3d_3dhp.yaml +++ b/configs/256x192_adam_lr1e-3-hrw48_cam_2x_w_pw3d_3dhp.yaml @@ -48,13 +48,13 @@ MODEL: - 2200 - 2200 LOSS: - TYPE: 'L1LossDimSMPLCam' + TYPE: 'LaplaceLossDimSMPLCam' ELEMENTS: BETA_WEIGHT: 1 BETA_REG_WEIGHT: 0 PHI_REG_WEIGHT: 0.0001 LEAF_REG_WEIGHT: 0 - TWIST_WEIGHT: 0.01 + TWIST_WEIGHT: 1 THETA_WEIGHT: 0.01 UVD24_WEIGHT: 1 XYZ24_WEIGHT: 0 diff --git a/configs/256x192_adam_lr1e-3-res34_smpl_3d_cam_2x_mix.yaml b/configs/256x192_adam_lr1e-3-res34_smpl_3d_cam_2x_mix.yaml index c97669b..aeba1f7 100644 --- a/configs/256x192_adam_lr1e-3-res34_smpl_3d_cam_2x_mix.yaml +++ b/configs/256x192_adam_lr1e-3-res34_smpl_3d_cam_2x_mix.yaml @@ -53,7 +53,7 @@ LOSS: BETA_REG_WEIGHT: 0 PHI_REG_WEIGHT: 0.0001 LEAF_REG_WEIGHT: 0 - TWIST_WEIGHT: 0.01 + TWIST_WEIGHT: 1 THETA_WEIGHT: 0.01 UVD24_WEIGHT: 1 XYZ24_WEIGHT: 1 diff --git a/configs/256x192_adam_lr1e-3-res34_smpl_3d_cam_2x_mix_w_pw3d.yaml b/configs/256x192_adam_lr1e-3-res34_smpl_3d_cam_2x_mix_w_pw3d.yaml index 17b4919..6f13647 100644 --- a/configs/256x192_adam_lr1e-3-res34_smpl_3d_cam_2x_mix_w_pw3d.yaml +++ b/configs/256x192_adam_lr1e-3-res34_smpl_3d_cam_2x_mix_w_pw3d.yaml @@ -47,13 +47,13 @@ MODEL: - 2200 - 2200 LOSS: - TYPE: 'L1LossDimSMPLCam' + TYPE: 'LaplaceLossDimSMPLCam' ELEMENTS: BETA_WEIGHT: 1 BETA_REG_WEIGHT: 0 PHI_REG_WEIGHT: 0.0001 LEAF_REG_WEIGHT: 0 - TWIST_WEIGHT: 0.01 + TWIST_WEIGHT: 1 THETA_WEIGHT: 0.01 UVD24_WEIGHT: 1 XYZ24_WEIGHT: 0 diff --git a/configs/256x192_adam_lr1e-3-res50_reg_smpl_3d_cam_2x_mix_w_pw3d.yaml b/configs/256x192_adam_lr1e-3-res50_reg_smpl_3d_cam_2x_mix_w_pw3d.yaml index 8d388c2..696e987 100644 --- a/configs/256x192_adam_lr1e-3-res50_reg_smpl_3d_cam_2x_mix_w_pw3d.yaml +++ b/configs/256x192_adam_lr1e-3-res50_reg_smpl_3d_cam_2x_mix_w_pw3d.yaml @@ -53,7 +53,7 @@ LOSS: BETA_REG_WEIGHT: 0 PHI_REG_WEIGHT: 0.0001 LEAF_REG_WEIGHT: 0 - TWIST_WEIGHT: 0.01 + TWIST_WEIGHT: 1 THETA_WEIGHT: 0.01 UVD24_WEIGHT: 1 XYZ24_WEIGHT: 0 diff --git a/hybrik/datasets/cocoeft.py b/hybrik/datasets/cocoeft.py new file mode 100644 index 0000000..1ae10c5 --- /dev/null +++ b/hybrik/datasets/cocoeft.py @@ -0,0 +1,235 @@ +"""MS COCO Human keypoint dataset.""" +import os + +# import scipy.misc +import cv2 +import joblib +import numpy as np +import torch +import torch.utils.data as data +from hybrik.utils.presets.simple_transform_3d_cam_eft import SimpleTransform3DCamEFT 
+from pytorch3d.transforms.rotation_conversions import matrix_to_axis_angle + +s_coco_2_smpl_jt = [ + -1, 11, 12, + -1, 13, 14, + -1, 15, 16, + -1, -1, -1, + -1, -1, -1, + -1, + 5, 6, + 7, 8, + 9, 10, + -1, -1 +] + + +class COCO_EFT_3D(data.Dataset): + """ COCO Person dataset. + Parameters + ---------- + ann_file: str, + Path to the annotation json file. + root: str, default './data/coco' + Path to the ms coco dataset. + train: bool, default is True + If true, will set as training mode. + skip_empty: bool, default is False + Whether skip entire image if no valid label is found. Use `False` if this dataset is + for validation to avoid COCO metric error. + """ + CLASSES = ['person'] + num_joints = 17 + EVAL_JOINTS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + joints_name = ('nose', 'left_eye', 'right_eye', 'left_ear', 'right_ear', # 4 + 'left_shoulder', 'right_shoulder', # 6 + 'left_elbow', 'right_elbow', # 8 + 'left_wrist', 'right_wrist', # 10 + 'left_hip', 'right_hip', # 12 + 'left_knee', 'right_knee', # 14 + 'left_ankle', 'right_ankle') # 16 + + def __init__(self, + cfg, + ann_file, + root='./data/coco', + train=True, + skip_empty=True, + dpg=False, + lazy_import=False): + + self._cfg = cfg + self._ann_file = os.path.join(root, 'annotations', ann_file) + self._lazy_import = lazy_import + self._root = root + self._skip_empty = skip_empty + self._train = train + self._dpg = dpg + + self._scale_factor = cfg.DATASET.SCALE_FACTOR + self._color_factor = cfg.DATASET.COLOR_FACTOR + self._rot = cfg.DATASET.ROT_FACTOR + self._input_size = cfg.MODEL.IMAGE_SIZE + self._output_size = cfg.MODEL.HEATMAP_SIZE + + self._occlusion = cfg.DATASET.OCCLUSION + + self._crop = cfg.MODEL.EXTRA.CROP + self._sigma = cfg.MODEL.EXTRA.SIGMA + + self._check_centers = False + + self.num_class = len(self.CLASSES) + + self.num_joints_half_body = cfg.DATASET.NUM_JOINTS_HALF_BODY + self.prob_half_body = cfg.DATASET.PROB_HALF_BODY + + self.augment = cfg.MODEL.EXTRA.AUGMENT + + self._loss_type = cfg.LOSS['TYPE'] + + self.upper_body_ids = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + self.lower_body_ids = (11, 12, 13, 14, 15, 16) + + bbox_3d_shape = getattr(cfg.MODEL, 'BBOX_3D_SHAPE', [2200, 2200, 2200]) + # millimeter -> meter + self.bbox_3d_shape = [item * 1e-3 for item in bbox_3d_shape] + + self.transformation = SimpleTransform3DCamEFT( + self, scale_factor=self._scale_factor, + color_factor=self._color_factor, + occlusion=self._occlusion, + input_size=self._input_size, + output_size=self._output_size, + depth_dim=64, + bbox_3d_shape=self.bbox_3d_shape, + rot=self._rot, sigma=self._sigma, + train=self._train, add_dpg=self._dpg, + loss_type=self._loss_type, scale_mult=1.25) + + self.db = self.load_pt() + + def __getitem__(self, idx): + # get image id + img_path = self.db['img_path'][idx] + img_id = int(os.path.splitext(os.path.basename(img_path))[0]) + + # load ground truth, including bbox, keypoints, image size + label = {} + for k in self.db.keys(): + try: + label[k] = self.db[k][idx].copy() + except AttributeError: + label[k] = self.db[k][idx] + + label_new = self.preprocess_pt_item(label, idx) + # img = scipy.misc.imread(img_path, mode='RGB') + src = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB) + # transform ground truth into training label and apply data augmentation + target = self.transformation(src, label_new) + + img = target.pop('image') + bbox = target.pop('bbox') + + return img, target, img_id, bbox + + def __len__(self): + return len(self.db['img_path']) + + def load_pt(self): + db = 
joblib.load(self._ann_file + '_smpl_annot.pt', 'r') + return db + + @property + def joint_pairs(self): + """Joint pairs which defines the pairs of joint to be swapped + when the image is flipped horizontally.""" + return [[1, 2], [3, 4], [5, 6], [7, 8], + [9, 10], [11, 12], [13, 14], [15, 16]] + + def _get_box_center_area(self, bbox): + """Get bbox center""" + c = np.array([(bbox[0] + bbox[2]) / 2.0, (bbox[1] + bbox[3]) / 2.0]) + area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0]) + return c, area + + def _get_keypoints_center_count(self, keypoints): + """Get geometric center of all keypoints""" + keypoint_x = np.sum(keypoints[:, 0, 0] * (keypoints[:, 0, 1] > 0)) + keypoint_y = np.sum(keypoints[:, 1, 0] * (keypoints[:, 1, 1] > 0)) + num = float(np.sum(keypoints[:, 0, 1])) + return np.array([keypoint_x / num, keypoint_y / num]), num + + def preprocess_pt_item(self, label, idx): + + # for k, v in label.items(): + # print(k) + beta = label['shape'].copy() + theta = label['pose'].copy().reshape(24, 3, 3) + theta = matrix_to_axis_angle(torch.from_numpy(theta)).numpy() + # scalar + smpl_weight = label['smpl_weight'].copy().reshape(-1) + + joint_cam_17 = label['xyz_17'].reshape((17, 3)) + joint_cam_17 = joint_cam_17 - joint_cam_17[0] + joint_cam_29 = label['xyz_29'].reshape((29, 3)) + joint_cam_29 = joint_cam_29 - joint_cam_29[0] + + joint_img_17 = np.zeros((17, 3)) + joints_vis_17 = np.zeros((17, 3)) * smpl_weight + joint_img_29 = np.zeros((29, 3)) + joints_vis_29 = np.ones((29, 3)) * smpl_weight + joints_vis_xyz_29 = np.ones((29, 3)) * smpl_weight + gt_joints = label['joints_3d'] + + # if smpl_weight[0] < 0.5: + if float(smpl_weight) < 0.5: + for i in range(24): + id1 = i + id2 = s_coco_2_smpl_jt[i] + if id2 >= 0: + joint_img_29[id1, :2] = gt_joints[id2, :2, 0].copy() + joints_vis_29[id1, :2] = gt_joints[id2, :2, 1].copy() + else: + uv_29 = label['uv_29'] + joint_img_29[:, :2] = uv_29 + joint_img_29[:, 2] = joint_cam_29[:, 2] + + twist_angle = label['twist_angle'].reshape(23) + cos = np.cos(twist_angle) + sin = np.sin(twist_angle) + phi = np.stack((cos, sin), axis=1) + phi_weight = np.ones_like(phi) * smpl_weight[0] + + flag = (twist_angle < -10) + phi_weight[flag, :] = 0 + + root_cam = joint_cam_29[0] + + f = np.array([1000.0, 1000.0]) + c = np.array([128.0, 128.0]) + + return_label = { + 'bbox': label['bbox'], + 'img_id': idx, + 'img_path': label['img_path'], + 'img_name': label['img_path'], + 'joint_img_17': joint_img_17, + 'joint_vis_17': joints_vis_17, + 'joint_cam_17': joint_cam_17, + 'joint_relative_17': joint_cam_17, + 'joint_img_29': joint_img_29, + 'joint_vis_29': joints_vis_29, + 'joint_vis_xyz_29': joints_vis_xyz_29, + 'joint_cam_29': joint_cam_29, + 'twist_phi': phi, + 'twist_weight': phi_weight, + 'beta': beta, + 'theta': theta, + 'root_cam': root_cam, + 'f': f, + 'c': c, + 'smpl_weight': smpl_weight + } + + return return_label diff --git a/hybrik/datasets/mix_dataset2_cam.py b/hybrik/datasets/mix_dataset2_cam.py index a3ebe95..68a0cd1 100644 --- a/hybrik/datasets/mix_dataset2_cam.py +++ b/hybrik/datasets/mix_dataset2_cam.py @@ -32,7 +32,7 @@ 9, 14, 10, 15, 11, 16, - -1, -1, # 23 + -1, -1, # 23 # 7, # -1, -1, # 21, 26 @@ -253,10 +253,12 @@ def __getitem__(self, idx): target['target_weight_29'] = label_uvd_29_mask target['target_xyz_17'] = label_xyz_17 target['target_weight_17'] = label_xyz_17_mask - target['target_theta'] = torch.zeros(24 * 4) + # target['target_theta'] = torch.zeros(24 * 4) + target['target_theta'] = torch.zeros(24 * 9) target['target_beta'] = torch.zeros(10) 
target['target_smpl_weight'] = torch.zeros(1) - target['target_theta_weight'] = torch.zeros(24 * 4) + # target['target_theta_weight'] = torch.zeros(24 * 4) + target['target_theta_weight'] = torch.zeros(24 * 9) target['target_twist'] = torch.zeros(23, 2) target['target_twist_weight'] = torch.zeros(23, 2) target['target_xyz_weight_24'] = label_xyz_24_mask diff --git a/hybrik/datasets/mix_dataset_cam.py b/hybrik/datasets/mix_dataset_cam.py index 0bff3a6..75af4c4 100644 --- a/hybrik/datasets/mix_dataset_cam.py +++ b/hybrik/datasets/mix_dataset_cam.py @@ -260,10 +260,12 @@ def __getitem__(self, idx): target['target_weight_29'] = label_uvd_29_mask target['target_xyz_17'] = label_xyz_17 target['target_weight_17'] = label_xyz_17_mask - target['target_theta'] = torch.zeros(24 * 4) + # target['target_theta'] = torch.zeros(24 * 4) + target['target_theta'] = torch.zeros(24 * 9) target['target_beta'] = torch.zeros(10) target['target_smpl_weight'] = torch.zeros(1) - target['target_theta_weight'] = torch.zeros(24 * 4) + # target['target_theta_weight'] = torch.zeros(24 * 4) + target['target_theta_weight'] = torch.zeros(24 * 9) target['target_twist'] = torch.zeros(23, 2) target['target_twist_weight'] = torch.zeros(23, 2) target['target_xyz_weight_24'] = label_xyz_24_mask diff --git a/hybrik/models/HRNetWithCam.py b/hybrik/models/HRNetWithCam.py index 709c028..6158850 100644 --- a/hybrik/models/HRNetWithCam.py +++ b/hybrik/models/HRNetWithCam.py @@ -332,7 +332,7 @@ def forward(self, x, flip_test=False, **kwargs): pred_xyz_jts_24_struct = output.joints.float() / self.depth_factor # -0.5 ~ 0.5 pred_xyz_jts_17 = output.joints_from_verts.float() / self.depth_factor - pred_theta_mats = output.rot_mats.float().reshape(batch_size, 24 * 4) + pred_theta_mats = output.rot_mats.float().reshape(batch_size, 24 * 9) pred_xyz_jts_24 = pred_xyz_jts_29[:, :24, :].reshape(batch_size, 72) pred_xyz_jts_24_struct = pred_xyz_jts_24_struct.reshape(batch_size, 72) pred_xyz_jts_17_flat = pred_xyz_jts_17.reshape(batch_size, 17 * 3) diff --git a/hybrik/models/HRNetWithCamReg.py b/hybrik/models/HRNetWithCamReg.py index 1aea466..de3db7c 100644 --- a/hybrik/models/HRNetWithCamReg.py +++ b/hybrik/models/HRNetWithCamReg.py @@ -351,7 +351,7 @@ def forward(self, x, flip_test=False, **kwargs): pred_xyz_jts_24_struct = output.joints.float() / self.depth_factor # -0.5 ~ 0.5 pred_xyz_jts_17 = output.joints_from_verts.float() / self.depth_factor - pred_theta_mats = output.rot_mats.float().reshape(batch_size, 24 * 4) + pred_theta_mats = output.rot_mats.float().reshape(batch_size, 24 * 9) pred_xyz_jts_24 = pred_xyz_jts_29[:, :24, :].reshape(batch_size, 72) pred_xyz_jts_24_struct = pred_xyz_jts_24_struct.reshape(batch_size, 72) pred_xyz_jts_17_flat = pred_xyz_jts_17.reshape(batch_size, 17 * 3) diff --git a/hybrik/models/criterion.py b/hybrik/models/criterion.py index 4ee6844..327e2c4 100644 --- a/hybrik/models/criterion.py +++ b/hybrik/models/criterion.py @@ -22,7 +22,7 @@ def weighted_l1_loss(input, target, weights, size_average): def weighted_laplace_loss(input, sigma, target, weights, size_average): input = input target = target - out = torch.log(sigma / amp) + torch.abs(input - target) / (math.sqrt(2) * sigma + 1e-9) + out = torch.log(sigma / amp) + torch.abs(input - target) / (math.sqrt(2) * sigma + 1e-5) out = out * weights if size_average and weights.sum() > 0: return out.sum() / weights.sum() diff --git a/hybrik/models/layers/smpl/SMPL.py b/hybrik/models/layers/smpl/SMPL.py index 6fc8864..6095945 100644 --- 
a/hybrik/models/layers/smpl/SMPL.py +++ b/hybrik/models/layers/smpl/SMPL.py @@ -261,7 +261,7 @@ def hybrik(self, leaf_thetas=leaf_thetas) rot_mats = rot_mats.reshape(batch_size * 24, 3, 3) - rot_mats = rotmat_to_quat(rot_mats).reshape(batch_size, 24 * 4) + # rot_mats = rotmat_to_quat(rot_mats).reshape(batch_size, 24 * 4) if transl is not None: new_joints += transl.unsqueeze(dim=1) diff --git a/hybrik/models/layers/smpl/lbs.py b/hybrik/models/layers/smpl/lbs.py index bf45101..9c13fa5 100644 --- a/hybrik/models/layers/smpl/lbs.py +++ b/hybrik/models/layers/smpl/lbs.py @@ -991,6 +991,12 @@ def batch_get_pelvis_orient(rel_pose_skeleton, rel_rest_pose, parents, children, spine_norm = torch.norm(spine_final_loc, dim=1, keepdim=True) spine_norm = spine_final_loc / (spine_norm + 1e-8) + assert torch.sum(torch.isnan(spine_rest_loc) + ) == 0, ('spine_rest_loc', spine_rest_loc) + + assert torch.sum(torch.isnan(spine_final_loc) + ) == 0, ('spine_final_loc', spine_final_loc) + rot_mat_spine = vectors2rotmat(spine_rest_loc, spine_final_loc, dtype) assert torch.sum(torch.isnan(rot_mat_spine) diff --git a/hybrik/models/simple3dposeSMPLWithCam.py b/hybrik/models/simple3dposeSMPLWithCam.py index d4aa633..1839f98 100644 --- a/hybrik/models/simple3dposeSMPLWithCam.py +++ b/hybrik/models/simple3dposeSMPLWithCam.py @@ -98,8 +98,8 @@ def __init__(self, norm_layer=nn.BatchNorm2d, **kwargs): init_cam = torch.tensor([0.9, 0, 0]) self.register_buffer( 'init_cam', - torch.Tensor(init_cam).float()) - + torch.Tensor(init_cam).float()) + self.avg_pool = nn.AdaptiveAvgPool2d(1) self.fc1 = nn.Linear(self.feature_channel, 1024) self.drop1 = nn.Dropout(p=0.5) @@ -108,6 +108,7 @@ def __init__(self, norm_layer=nn.BatchNorm2d, **kwargs): self.decshape = nn.Linear(1024, 10) self.decphi = nn.Linear(1024, 23 * 2) # [cos(phi), sin(phi)] self.deccam = nn.Linear(1024, 3) + self.decsigma = nn.Linear(1024, 29) self.focal_length = kwargs['FOCAL_LENGTH'] self.bbox_3d_shape = kwargs['BBOX_3D_SHAPE'] if 'BBOX_3D_SHAPE' in kwargs else (2000, 2000, 2000) @@ -238,7 +239,7 @@ def forward(self, x, flip_test=False, **kwargs): x0 = self.avg_pool(x0) x0 = x0.view(x0.size(0), -1) init_shape = self.init_shape.expand(batch_size, -1) # (B, 10,) - init_cam = self.init_cam.expand(batch_size, -1) # (B, 3,) + init_cam = self.init_cam.expand(batch_size, -1) # (B, 3,) xc = x0 @@ -251,6 +252,7 @@ def forward(self, x, flip_test=False, **kwargs): pred_shape = delta_shape + init_shape pred_phi = self.decphi(xc) pred_camera = self.deccam(xc).reshape(batch_size, -1) + init_cam + sigma = self.decsigma(xc).reshape(batch_size, 29, 1).sigmoid() pred_phi = pred_phi.reshape(batch_size, 23, 2) @@ -267,6 +269,8 @@ def forward(self, x, flip_test=False, **kwargs): flip_pred_shape = flip_delta_shape + init_shape flip_pred_phi = self.decphi(flip_xc) flip_pred_camera = self.deccam(flip_xc).reshape(batch_size, -1) + init_cam + flip_sigma = self.decsigma(flip_x0).reshape( + batch_size, 29, 1).sigmoid() pred_shape = (pred_shape + flip_pred_shape) / 2 @@ -277,6 +281,9 @@ def forward(self, x, flip_test=False, **kwargs): flip_pred_camera[:, 1] = -flip_pred_camera[:, 1] pred_camera = (pred_camera + flip_pred_camera) / 2 + flip_sigma = self.flip_sigma(flip_sigma) + sigma = (sigma + flip_sigma) / 2 + camScale = pred_camera[:, :1].unsqueeze(1) camTrans = pred_camera[:, 1:].unsqueeze(1) @@ -309,8 +316,7 @@ def forward(self, x, flip_test=False, **kwargs): camera_root[:, 2] += camDepth[:, 0, 0] else: pred_xyz_jts_29[:, :, 2:] = pred_uvd_jts_29[:, :, 2:].clone() # unit: (self.depth_factor 
m) - pred_xyz_jts_29_meter = (pred_uvd_jts_29[:, :, :2] * self.input_size / self.focal_length) \ - * (pred_xyz_jts_29[:, :, 2:]*self.depth_factor + camDepth) - camTrans # unit: m + pred_xyz_jts_29_meter = (pred_uvd_jts_29[:, :, :2] * self.input_size / self.focal_length) * (pred_xyz_jts_29[:, :, 2:] * self.depth_factor + camDepth) - camTrans # unit: m pred_xyz_jts_29[:, :, :2] = pred_xyz_jts_29_meter / self.depth_factor # unit: (self.depth_factor m) @@ -322,7 +328,7 @@ def forward(self, x, flip_test=False, **kwargs): pred_xyz_jts_29_flat = pred_xyz_jts_29.reshape(batch_size, -1) output = self.smpl.hybrik( - pose_skeleton=pred_xyz_jts_29.type(self.smpl_dtype) * self.depth_factor, # unit: meter + pose_skeleton=pred_xyz_jts_29.type(self.smpl_dtype) * self.depth_factor, # unit: meter betas=pred_shape.type(self.smpl_dtype), phis=pred_phi.type(self.smpl_dtype), global_orient=None, @@ -333,7 +339,7 @@ def forward(self, x, flip_test=False, **kwargs): pred_xyz_jts_24_struct = output.joints.float() / self.depth_factor # -0.5 ~ 0.5 pred_xyz_jts_17 = output.joints_from_verts.float() / self.depth_factor - pred_theta_mats = output.rot_mats.float().reshape(batch_size, 24 * 4) + pred_theta_mats = output.rot_mats.float().reshape(batch_size, 24 * 9) pred_xyz_jts_24 = pred_xyz_jts_29[:, :24, :].reshape(batch_size, 72) pred_xyz_jts_24_struct = pred_xyz_jts_24_struct.reshape(batch_size, 72) pred_xyz_jts_17_flat = pred_xyz_jts_17.reshape(batch_size, 17 * 3) @@ -356,6 +362,8 @@ def forward(self, x, flip_test=False, **kwargs): cam_trans=camTrans[:, 0], cam_root=camera_root, transl=transl, + pred_sigma=sigma, + scores=1 - sigma, # uvd_heatmap=torch.stack([hm_x0, hm_y0, hm_z0], dim=2), # uvd_heatmap=heatmaps, # img_feat=x0 diff --git a/hybrik/models/simple3dposeSMPLWithCamReg.py b/hybrik/models/simple3dposeSMPLWithCamReg.py index 9ed4543..f59c0aa 100644 --- a/hybrik/models/simple3dposeSMPLWithCamReg.py +++ b/hybrik/models/simple3dposeSMPLWithCamReg.py @@ -256,7 +256,7 @@ def forward(self, x, flip_test=False, **kwargs): pred_xyz_jts_24_struct = output.joints.float() / self.depth_factor # -0.5 ~ 0.5 pred_xyz_jts_17 = output.joints_from_verts.float() / self.depth_factor - pred_theta_mats = output.rot_mats.float().reshape(batch_size, 24 * 4) + pred_theta_mats = output.rot_mats.float().reshape(batch_size, 24 * 9) pred_xyz_jts_24 = pred_xyz_jts_29[:, :24, :].reshape(batch_size, 72) pred_xyz_jts_24_struct = pred_xyz_jts_24_struct.reshape(batch_size, 72) pred_xyz_jts_17_flat = pred_xyz_jts_17.reshape(batch_size, 17 * 3) diff --git a/hybrik/utils/presets/simple_transform_3d_cam_eft.py b/hybrik/utils/presets/simple_transform_3d_cam_eft.py new file mode 100644 index 0000000..7053b0b --- /dev/null +++ b/hybrik/utils/presets/simple_transform_3d_cam_eft.py @@ -0,0 +1,592 @@ +import math +import random + +import cv2 +import numpy as np +import torch + +from ..bbox import _box_to_center_scale, _center_scale_to_box +from ..transforms import (addDPG, affine_transform, flip_joints_3d, flip_thetas, flip_xyz_joints_3d, + get_affine_transform, im_to_torch, batch_rodrigues_numpy, flip_twist, + rotmat_to_quat_numpy, rotate_xyz_jts, rot_aa, flip_cam_xyz_joints_3d) +from ..pose_utils import get_intrinsic_metrix + +s_coco_2_smpl_jt = [ + -1, 11, 12, + -1, 13, 14, + -1, 15, 16, + -1, -1, -1, + -1, -1, -1, + -1, + 5, 6, + 7, 8, + 9, 10, + -1, -1 +] + +smpl_parents = [-1, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 12, 13, 14, + 16, 17, 18, 19, 20, 21] + + +left_bones_idx = [ + (0, 1), (1, 4), (4, 7), (12, 13), + (13, 16), (16, 18), (18, 20) +] + 
+right_bones_idx = [ + (0, 2), (2, 5), (5, 8), (12, 14), + (14, 17), (17, 19), (19, 21) +] + +skeleton_29 = [ + (0, 1), (0, 2), (0, 3), (1, 4), (2, 5), (3, 6), # 5 + (4, 7), (5, 8), (6, 9), (7, 10), (8, 11), (9, 12), # 11 + (9, 13), (9, 14), (12, 15), (13, 16), (14, 17), (16, 18), # 17 + (17, 19), (18, 20), (19, 21), (20, 22), (21, 23), (15, 24), # 23 + (22, 25), (23, 26), (10, 27), (11, 28) # 27 +] + +skeleton_3dhp = np.array([(-1, -1)] * 28).astype(int) +skeleton_3dhp[ [6, 7, 17, 18, 19, 20] ] = np.array([ + (19, 20), (24, 25), (9, 10), (14, 15), (10, 11), (15, 16) + ]).astype(int) + + +class SimpleTransform3DCamEFT(object): + """Generation of cropped input person, pose coords, smpl parameters. + Parameters + ---------- + img: torch.Tensor + A tensor with shape: `(3, h, w)`. + label: dict + A dictionary with 4 keys: + `bbox`: [xmin, ymin, xmax, ymax] + `joints_3d`: numpy.ndarray with shape: (n_joints, 2), + including position and visible flag + `width`: image width + `height`: image height + dataset: + The dataset to be transformed, must include `joint_pairs` property for flipping. + scale_factor: int + Scale augmentation. + input_size: tuple + Input image size, as (height, width). + output_size: tuple + Heatmap size, as (height, width). + rot: int + Ratation augmentation. + train: bool + True for training trasformation. + """ + + def __init__(self, dataset, scale_factor, color_factor, occlusion, add_dpg, + input_size, output_size, depth_dim, bbox_3d_shape, + rot, sigma, train, loss_type='MSELoss', scale_mult=1.25, focal_length=1000, two_d=False, + root_idx=0): + + self._joint_pairs_17 = ((1, 4), (2, 5), (3, 6), (11, 14), (12, 15), (13, 16)) + self._joint_pairs_24 = ((1, 2), (4, 5), (7, 8), (10, 11), (13, 14), (16, 17), (18, 19), (20, 21), (22, 23)) + self._joint_pairs_29 = ((1, 2), (4, 5), (7, 8), (10, 11), (13, 14), (16, 17), (18, 19), (20, 21), (22, 23), (25, 26), (27, 28)) + + self._scale_factor = scale_factor + self._color_factor = color_factor + self._occlusion = occlusion + self._rot = rot + self._add_dpg = add_dpg + + self._input_size = input_size + self._heatmap_size = output_size + + self._sigma = sigma + self._train = train + self._loss_type = loss_type + self._aspect_ratio = float(input_size[1]) / input_size[0] # w / h + self._feat_stride = np.array(input_size) / np.array(output_size) + + self.pixel_std = 1 + + self.bbox_3d_shape = dataset.bbox_3d_shape + self._scale_mult = scale_mult + self.two_d = two_d + + # convert to unit: meter + self.depth_factor2meter = self.bbox_3d_shape[2] if self.bbox_3d_shape[2] < 50 else self.bbox_3d_shape[2]*1e-3 + + self.focal_length = focal_length + self.root_idx = root_idx + + if train: + self.num_joints_half_body = dataset.num_joints_half_body + self.prob_half_body = dataset.prob_half_body + + self.upper_body_ids = dataset.upper_body_ids + self.lower_body_ids = dataset.lower_body_ids + + def test_transform(self, src, bbox): + xmin, ymin, xmax, ymax = bbox + center, scale = _box_to_center_scale( + xmin, ymin, xmax - xmin, ymax - ymin, self._aspect_ratio, scale_mult=self._scale_mult) + scale = scale * 1.0 + + input_size = self._input_size + inp_h, inp_w = input_size + trans = get_affine_transform(center, scale, 0, [inp_w, inp_h]) + img = cv2.warpAffine(src, trans, (int(inp_w), int(inp_h)), flags=cv2.INTER_LINEAR) + bbox = _center_scale_to_box(center, scale) + + img = im_to_torch(img) + # mean + img[0].add_(-0.406) + img[1].add_(-0.457) + img[2].add_(-0.480) + + # std + img[0].div_(0.225) + img[1].div_(0.224) + img[2].div_(0.229) + + img_center 
= np.array([float(src.shape[1]) * 0.5, float(src.shape[0]) * 0.5]) + + return img, bbox, img_center + + def _integral_target_generator(self, joints_3d, num_joints, patch_height, patch_width): + target_weight = np.ones((num_joints, 3), dtype=np.float32) + target_weight[:, 0] = joints_3d[:, 0, 1] + target_weight[:, 1] = joints_3d[:, 0, 1] + target_weight[:, 2] = joints_3d[:, 0, 1] + + target = np.zeros((num_joints, 3), dtype=np.float32) + target[:, 0] = joints_3d[:, 0, 0] / patch_width - 0.5 + target[:, 1] = joints_3d[:, 1, 0] / patch_height - 0.5 + target[:, 2] = joints_3d[:, 2, 0] / self.bbox_3d_shape[0] + + target_weight[target[:, 0] > 0.5] = 0 + target_weight[target[:, 0] < -0.5] = 0 + target_weight[target[:, 1] > 0.5] = 0 + target_weight[target[:, 1] < -0.5] = 0 + target_weight[target[:, 2] > 0.5] = 0 + target_weight[target[:, 2] < -0.5] = 0 + + target = target.reshape((-1)) + target_weight = target_weight.reshape((-1)) + return target, target_weight + + def _integral_uvd_target_generator(self, joints_3d, num_joints, patch_height, patch_width): + + target_weight = np.ones((num_joints, 3), dtype=np.float32) + target_weight[:, 0] = joints_3d[:, 0, 1] + target_weight[:, 1] = joints_3d[:, 0, 1] + target_weight[:, 2] = joints_3d[:, 0, 1] + + target = np.zeros((num_joints, 3), dtype=np.float32) + target[:, 0] = joints_3d[:, 0, 0] / patch_width - 0.5 + target[:, 1] = joints_3d[:, 1, 0] / patch_height - 0.5 + target[:, 2] = joints_3d[:, 2, 0] / self.bbox_3d_shape[2] + + target_weight[target[:, 0] > 0.5] = 0 + target_weight[target[:, 0] < -0.5] = 0 + target_weight[target[:, 1] > 0.5] = 0 + target_weight[target[:, 1] < -0.5] = 0 + target_weight[target[:, 2] > 0.5] = 0 + target_weight[target[:, 2] < -0.5] = 0 + + target = target.reshape((-1)) + target_weight = target_weight.reshape((-1)) + return target, target_weight + + def _integral_xyz_target_generator(self, joints_3d, joints_3d_vis, num_joints): + target_weight = np.ones((num_joints, 3), dtype=np.float32) + target_weight[:, 0] = joints_3d_vis[:, 0] + target_weight[:, 1] = joints_3d_vis[:, 1] + target_weight[:, 2] = joints_3d_vis[:, 2] + + target = np.zeros((num_joints, 3), dtype=np.float32) + target[:, 0] = joints_3d[:, 0] / self.bbox_3d_shape[0] + target[:, 1] = joints_3d[:, 1] / self.bbox_3d_shape[1] + target[:, 2] = joints_3d[:, 2] / self.bbox_3d_shape[2] + + # if self.bbox_3d_shape[0] < 1000: + # print(self.bbox_3d_shape, target) + + # assert (target[0] == 0).all(), f'{target}, {self.bbox_3d_shape}' + + target = target.reshape((-1)) + target_weight = target_weight.reshape((-1)) + return target, target_weight + + def __call__(self, src, label): + if self.two_d: + assert NotImplementedError + else: + bbox = list(label['bbox']) + joint_img_17 = label['joint_img_17'].copy() + joint_relative_17 = label['joint_relative_17'].copy() + joint_cam_17 = label['joint_cam_17'].copy() + joints_vis_17 = label['joint_vis_17'].copy() + joint_img_29 = label['joint_img_29'].copy() + joint_cam_29 = label['joint_cam_29'].copy() + joints_vis_29 = label['joint_vis_29'].copy() + joints_vis_xyz_29 = label['joint_vis_xyz_29'].copy() + smpl_weight = label['smpl_weight'].copy() + # root_cam = label['root_cam'].copy() + # root_depth = root_cam[2] / self.bbox_3d_shape[2] + self.num_joints = joint_img_29.shape[0] + + beta = label['beta'].copy() + theta = label['theta'].copy() + + beta_kid = label['beta_kid'].copy() if 'beta_kid' in label else np.zeros(1) + + # assert not (theta < 1e-3).all(), label + + if 'twist_phi' in label.keys(): + twist_phi = 
label['twist_phi'].copy() + twist_weight = label['twist_weight'].copy() + else: + twist_phi = np.zeros((23, 2)) + twist_weight = np.zeros((23, 2)) + + gt_joints_17 = np.zeros((17, 3, 2), dtype=np.float32) + gt_joints_17[:, :, 0] = joint_img_17.copy() + gt_joints_17[:, :, 1] = joints_vis_17.copy() + gt_joints_29 = np.zeros((29, 3, 2), dtype=np.float32) + gt_joints_29[:, :, 0] = joint_img_29.copy() + gt_joints_29[:, :, 1] = joints_vis_29.copy() + + imgwidth, imght = src.shape[1], src.shape[0] + + input_size = self._input_size + + if self._add_dpg and self._train: + bbox = addDPG(bbox, imgwidth, imght) + + xmin, ymin, xmax, ymax = bbox + center, scale = _box_to_center_scale( + xmin, ymin, xmax - xmin, ymax - ymin, self._aspect_ratio, scale_mult=self._scale_mult) + + xmin, ymin, xmax, ymax = _center_scale_to_box(center, scale) + + # half body transform + if self._train and (np.sum(joints_vis_17[:, 0]) > self.num_joints_half_body and np.random.rand() < self.prob_half_body): + c_half_body, s_half_body = self.half_body_transform( + gt_joints_17[:, :, 0], joints_vis_17 + ) + + if c_half_body is not None and s_half_body is not None: + center, scale = c_half_body, s_half_body + + # rescale + if self._train: + sf = self._scale_factor + scale = scale * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) + else: + scale = scale * 1.0 + + # rotation + if self._train: + rf = self._rot + r = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) if random.random() <= 0.6 else 0 + else: + r = 0 + + if self._train and self._occlusion: + while True: + area_min = 0.0 + area_max = 0.3 + synth_area = (random.random() * (area_max - area_min) + area_min) * (xmax - xmin) * (ymax - ymin) + + ratio_min = 0.5 + ratio_max = 1 / 0.5 + synth_ratio = (random.random() * (ratio_max - ratio_min) + ratio_min) + + synth_h = math.sqrt(synth_area * synth_ratio) + synth_w = math.sqrt(synth_area / synth_ratio) + synth_xmin = random.random() * ((xmax - xmin) - synth_w - 1) + xmin + synth_ymin = random.random() * ((ymax - ymin) - synth_h - 1) + ymin + + if synth_xmin >= 0 and synth_ymin >= 0 and synth_xmin + synth_w < imgwidth and synth_ymin + synth_h < imght: + synth_xmin = int(synth_xmin) + synth_ymin = int(synth_ymin) + synth_w = int(synth_w) + synth_h = int(synth_h) + src[synth_ymin:synth_ymin + synth_h, synth_xmin:synth_xmin + synth_w, :] = np.random.rand(synth_h, synth_w, 3) * 255 + break + + joints_17_uvd = gt_joints_17 + joints_29_uvd = gt_joints_29 + + joint_cam_17_xyz = joint_cam_17 + joints_cam_24_xyz = joint_cam_29[:24] + + if random.random() > 0.75 and self._train: + assert src.shape[2] == 3 + src = src[:, ::-1, :] + + joints_17_uvd = flip_joints_3d(joints_17_uvd, imgwidth, self._joint_pairs_17) + joints_29_uvd = flip_joints_3d(joints_29_uvd, imgwidth, self._joint_pairs_29) + joint_cam_17_xyz = flip_cam_xyz_joints_3d(joint_cam_17_xyz, self._joint_pairs_17) + joints_cam_24_xyz = flip_cam_xyz_joints_3d(joints_cam_24_xyz, self._joint_pairs_24) + theta = flip_thetas(theta, self._joint_pairs_24) + twist_phi, twist_weight = flip_twist(twist_phi, twist_weight, self._joint_pairs_24) + center[0] = imgwidth - center[0] - 1 + + # rotate global theta + theta[0, :3] = rot_aa(theta[0, :3], r) + + theta_rot_mat = batch_rodrigues_numpy(theta).reshape(24 * 9) + # theta_quat = rotmat_to_quat_numpy(theta_rot_mat).reshape(24 * 4) + + # rotate xyz joints + joint_cam_17_xyz = rotate_xyz_jts(joint_cam_17_xyz, r) + joints_17_xyz = joint_cam_17_xyz - joint_cam_17_xyz[:1].copy() + joints_cam_24_xyz = rotate_xyz_jts(joints_cam_24_xyz, r) + 
joints_24_xyz = joints_cam_24_xyz - joints_cam_24_xyz[:1].copy() + + inp_h, inp_w = input_size + trans = get_affine_transform(center, scale, r, [inp_w, inp_h]) + trans_inv = get_affine_transform(center, scale, r, [inp_w, inp_h], inv=True).astype(np.float32) + intrinsic_param = get_intrinsic_metrix(label['f'], label['c'], inv=True).astype(np.float32) if 'f' in label.keys() else np.zeros((3, 3)).astype(np.float32) + joint_root = label['root_cam'].astype(np.float32) if 'root_cam' in label.keys() else np.zeros((3)).astype(np.float32) + depth_factor = np.array([self.bbox_3d_shape[2]]).astype(np.float32) if self.bbox_3d_shape else np.zeros((1)).astype(np.float32) + + img = cv2.warpAffine(src, trans, (int(inp_w), int(inp_h)), flags=cv2.INTER_LINEAR) + # affine transform + for i in range(17): + if joints_17_uvd[i, 0, 1] > 0.0: + joints_17_uvd[i, 0:2, 0] = affine_transform(joints_17_uvd[i, 0:2, 0], trans) + + for i in range(29): + if joints_29_uvd[i, 0, 1] > 0.0: + joints_29_uvd[i, 0:2, 0] = affine_transform(joints_29_uvd[i, 0:2, 0], trans) + + target_smpl_weight = torch.ones(1).float() * smpl_weight + theta_24_weights = np.ones((24, 9)) * smpl_weight + + theta_24_weights = theta_24_weights.reshape(24 * 9) + + # generate training targets + target_uvd_29, target_weight_29 = self._integral_uvd_target_generator(joints_29_uvd, 29, inp_h, inp_w) + target_xyz_17, target_weight_17 = self._integral_xyz_target_generator(joints_17_xyz, joints_vis_17, 17) + target_xyz_24, target_weight_24 = self._integral_xyz_target_generator(joints_24_xyz, joints_vis_29[:24, :], 24) + + target_weight_29 *= joints_vis_29.reshape(-1) + target_weight_24 *= joints_vis_xyz_29[:24, :].reshape(-1) + target_weight_17 *= joints_vis_17.reshape(-1) + bbox = _center_scale_to_box(center, scale) + + tmp_uvd_24 = target_uvd_29.reshape(-1, 3)[:24] + tmp_uvd_24_weight = target_weight_29.reshape(-1, 3)[:24] * target_weight_24.reshape(-1, 3) + + if self.focal_length > 0: + cam_scale, cam_trans, cam_valid, cam_error, new_uvd = self.calc_cam_scale_trans2( + target_xyz_24.reshape(-1, 3).copy(), + tmp_uvd_24.copy(), + tmp_uvd_24_weight.copy()) + + # target_uvd_29 = (target_uvd_29 * target_weight_29).reshape(-1, 3) + else: + cam_scale = 1 + cam_trans = np.zeros(2) + cam_valid = 0 + cam_error = 0 + + assert img.shape[2] == 3 + if self._train: + c_high = 1 + self._color_factor + c_low = 1 - self._color_factor + img[:, :, 0] = np.clip(img[:, :, 0] * random.uniform(c_low, c_high), 0, 255) + img[:, :, 1] = np.clip(img[:, :, 1] * random.uniform(c_low, c_high), 0, 255) + img[:, :, 2] = np.clip(img[:, :, 2] * random.uniform(c_low, c_high), 0, 255) + + img = im_to_torch(img) + # mean + img[0].add_(-0.406) + img[1].add_(-0.457) + img[2].add_(-0.480) + + # std + img[0].div_(0.225) + img[1].div_(0.224) + img[2].div_(0.229) + + img_center = np.array([float(imgwidth) * 0.5, float(imght) * 0.5]) + + # target_weight_29 = target_weight_29.reshape(29, 3) + # target_weight_29[:, 2] = 0 + # target_weight_29 = target_weight_29.reshape(-1) + + output = { + 'type': '3d_data_w_smpl', + 'image': img, + # 'target_theta': torch.from_numpy(theta_quat).float(), + 'target_theta': torch.from_numpy(theta_rot_mat).float(), + 'target_theta_weight': torch.from_numpy(theta_24_weights).float(), + 'target_beta': torch.from_numpy(beta).float(), + 'target_smpl_weight': target_smpl_weight, + 'target_uvd_29': torch.from_numpy(target_uvd_29.reshape(-1)).float(), + 'target_xyz_24': torch.from_numpy(target_xyz_24).float(), + 'target_weight_29': torch.from_numpy(target_weight_29).float(), + 
'target_weight_24': torch.from_numpy(target_weight_24).float(), + 'target_xyz_17': torch.from_numpy(target_xyz_17).float(), + 'target_weight_17': torch.from_numpy(target_weight_17).float(), + 'target_xyz_weight_24': torch.from_numpy(target_weight_24).float(), + 'trans_inv': torch.from_numpy(trans_inv).float(), + 'intrinsic_param': torch.from_numpy(intrinsic_param).float(), + 'joint_root': torch.from_numpy(joint_root).float(), + 'depth_factor': torch.from_numpy(depth_factor).float(), + 'bbox': torch.Tensor(bbox), + 'target_twist': torch.from_numpy(twist_phi).float(), + 'target_twist_weight': torch.from_numpy(twist_weight).float(), + 'camera_scale': torch.from_numpy(np.array([cam_scale])).float(), + 'camera_trans': torch.from_numpy(cam_trans).float(), + 'camera_valid': cam_valid, + 'camera_error': cam_error, + 'img_center': torch.from_numpy(img_center).float(), + 'target_beta_kid': torch.from_numpy(beta_kid).float(), + } + + return output + + def half_body_transform(self, joints, joints_vis): + upper_joints = [] + lower_joints = [] + for joint_id in range(self.num_joints): + if joints_vis[joint_id][0] > 0: + if joint_id in self.upper_body_ids: + upper_joints.append(joints[joint_id]) + else: + lower_joints.append(joints[joint_id]) + + if np.random.randn() < 0.5 and len(upper_joints) > 2: + selected_joints = upper_joints + else: + selected_joints = lower_joints \ + if len(lower_joints) > 2 else upper_joints + + if len(selected_joints) < 2: + return None, None + + selected_joints = np.array(selected_joints, dtype=np.float32) + center = selected_joints.mean(axis=0)[:2] + + left_top = np.amin(selected_joints, axis=0) + right_bottom = np.amax(selected_joints, axis=0) + + w = right_bottom[0] - left_top[0] + h = right_bottom[1] - left_top[1] + + if w > self._aspect_ratio * h: + h = w * 1.0 / self._aspect_ratio + elif w < self._aspect_ratio * h: + w = h * self._aspect_ratio + + scale = np.array( + [ + w * 1.0 / self.pixel_std, + h * 1.0 / self.pixel_std + ], + dtype=np.float32 + ) + + scale = scale * 1.5 + + return center, scale + + def calc_cam_scale_trans2(self, xyz_29, uvd_29, uvd_weight): + + f = self.focal_length + + # unit: meter + # the equation to be solved: + # u * 256 / f * (z + f/256 * 1/scale) = x + tx + # v * 256 / f * (z + f/256 * 1/scale) = y + ty + + weight = (uvd_weight.sum(axis=-1, keepdims=True) >= 3.0) * 1.0 # 24 x 1 + # assert weight.sum() >= 2, 'too few valid keypoints to calculate cam para' + + if weight.sum() < 2: + # print('bad data') + return 0, np.zeros(2), 0.0, -1, uvd_29 + + xyz_29 = xyz_29 * self.depth_factor2meter # convert to meter + new_uvd = uvd_29.copy() + + num_joints = len(uvd_29) + + Ax = np.zeros((num_joints, 3)) + Ax[:, 1] = -1 + Ax[:, 0] = uvd_29[:, 0] + + Ay = np.zeros((num_joints, 3)) + Ay[:, 2] = -1 + Ay[:, 0] = uvd_29[:, 1] + + Ax = Ax * weight + Ay = Ay * weight + + A = np.concatenate([Ax, Ay], axis=0) + + bx = (xyz_29[:, 0] - 256 * uvd_29[:, 0] / f * xyz_29[:, 2]) * weight[:, 0] + by = (xyz_29[:, 1] - 256 * uvd_29[:, 1] / f * xyz_29[:, 2]) * weight[:, 0] + b = np.concatenate([bx, by], axis=0) + + A_s = np.dot(A.T, A) + b_s = np.dot(A.T, b) + + cam_para = np.linalg.solve(A_s, b_s) + + trans = cam_para[1:] + scale = 1.0 / cam_para[0] + + target_camera = np.zeros(3) + target_camera[0] = scale + target_camera[1:] = trans + + backed_projected_xyz = self.back_projection(uvd_29, target_camera, f) + backed_projected_xyz[:, 2] = backed_projected_xyz[:, 2] * self.depth_factor2meter + diff = np.sum((backed_projected_xyz - xyz_29)**2, axis=-1) * weight[:, 0] + diff 
= np.sqrt(diff).sum() / (weight.sum() + 1e-6) * 1000 # roughly mpjpe > 70 + # print(scale, trans, diff) + if diff < 70: + new_uvd = self.projection(xyz_29, target_camera, f) + return scale, trans, 1.0, diff, new_uvd * uvd_weight + else: + return scale, trans, 0.0, diff, new_uvd + + def projection(self, xyz, camera, f): + # xyz: unit: meter, u = f/256 * (x+dx) / (z+dz) + transl = camera[1:3] + scale = camera[0] + z_cam = xyz[:, 2:] + f / (256.0 * scale) # J x 1 + uvd = np.zeros_like(xyz) + uvd[:, 2] = xyz[:, 2] / self.bbox_3d_shape[2] + uvd[:, :2] = f / 256.0 * (xyz[:, :2] + transl) / z_cam + return uvd + + def back_projection(self, uvd, pred_camera, focal_length=5000.): + camScale = pred_camera[:1].reshape(1, -1) + camTrans = pred_camera[1:].reshape(1, -1) + + camDepth = focal_length / (256 * camScale) + + pred_xyz = np.zeros_like(uvd) + pred_xyz[:, 2] = uvd[:, 2].copy() + pred_xyz[:, :2] = (uvd[:, :2] * 256 / focal_length) * (pred_xyz[:, 2:] * self.depth_factor2meter + camDepth) - camTrans + + return pred_xyz + + +def _box_to_center_scale_nosquare(x, y, w, h, aspect_ratio=1.0, scale_mult=1.5): + """Convert box coordinates to center and scale. + adapted from https://github.com/Microsoft/human-pose-estimation.pytorch + """ + pixel_std = 1 + center = np.zeros((2), dtype=np.float32) + center[0] = x + w * 0.5 + center[1] = y + h * 0.5 + + scale = np.array( + [w * 1.0 / pixel_std, h * 1.0 / pixel_std], dtype=np.float32) + if center[0] != -1: + scale = scale * scale_mult + return center, scale diff --git a/hybrik/utils/presets/simple_transform_3d_smpl_cam.py b/hybrik/utils/presets/simple_transform_3d_smpl_cam.py index 9890be8..8f62dad 100644 --- a/hybrik/utils/presets/simple_transform_3d_smpl_cam.py +++ b/hybrik/utils/presets/simple_transform_3d_smpl_cam.py @@ -8,7 +8,7 @@ from ..bbox import _box_to_center_scale, _center_scale_to_box from ..transforms import (addDPG, affine_transform, flip_joints_3d, flip_thetas, flip_xyz_joints_3d, get_affine_transform, im_to_torch, batch_rodrigues_numpy, flip_twist, - rotmat_to_quat_numpy, rotate_xyz_jts, rot_aa, flip_cam_xyz_joints_3d) + rotate_xyz_jts, rot_aa, flip_cam_xyz_joints_3d) from ..pose_utils import get_intrinsic_metrix s_coco_2_smpl_jt = [ @@ -105,7 +105,7 @@ class SimpleTransform3DSMPLCam(object): def __init__(self, dataset, scale_factor, color_factor, occlusion, add_dpg, input_size, output_size, depth_dim, bbox_3d_shape, rot, sigma, train, loss_type='MSELoss', scale_mult=1.25, focal_length=1000, two_d=False, - root_idx=0, get_paf=False): + root_idx=0): if two_d: self._joint_pairs = dataset.joint_pairs else: @@ -140,9 +140,6 @@ def __init__(self, dataset, scale_factor, color_factor, occlusion, add_dpg, self.focal_length = focal_length self.root_idx = root_idx - self.get_paf = get_paf - # self.use_camera = use_camera - if train: self.num_joints_half_body = dataset.num_joints_half_body self.prob_half_body = dataset.prob_half_body @@ -454,7 +451,6 @@ def __call__(self, src, label): joint_cam_17_xyz = joint_cam_17 joints_cam_24_xyz = joint_cam_29[:24] - if random.random() > 0.75 and self._train: assert src.shape[2] == 3 src = src[:, ::-1, :] @@ -470,8 +466,8 @@ def __call__(self, src, label): # rotate global theta theta[0, :3] = rot_aa(theta[0, :3], r) - theta_rot_mat = batch_rodrigues_numpy(theta) - theta_quat = rotmat_to_quat_numpy(theta_rot_mat).reshape(24 * 4) + theta_rot_mat = batch_rodrigues_numpy(theta).reshape(24 * 9) + # theta_quat = rotmat_to_quat_numpy(theta_rot_mat).reshape(24 * 4) # rotate xyz joints joint_cam_17_xyz = 
rotate_xyz_jts(joint_cam_17_xyz, r) @@ -497,9 +493,10 @@ def __call__(self, src, label): joints_29_uvd[i, 0:2, 0] = affine_transform(joints_29_uvd[i, 0:2, 0], trans) target_smpl_weight = torch.ones(1).float() - theta_24_weights = np.ones((24, 4)) - - theta_24_weights = theta_24_weights.reshape(24 * 4) + # theta_24_weights = np.ones((24, 4)) + # theta_24_weights = theta_24_weights.reshape(24 * 4) + theta_24_weights = np.ones((24, 9)) + theta_24_weights = theta_24_weights.reshape(24 * 9) # generate training targets target_uvd_29, target_weight_29 = self._integral_uvd_target_generator(joints_29_uvd, 29, inp_h, inp_w) @@ -517,9 +514,9 @@ def __call__(self, src, label): if self.focal_length > 0: cam_scale, cam_trans, cam_valid, cam_error, new_uvd = self.calc_cam_scale_trans2( target_xyz_24.reshape(-1, 3).copy(), - tmp_uvd_24.copy(), + tmp_uvd_24.copy(), tmp_uvd_24_weight.copy()) - + target_uvd_29 = (target_uvd_29 * target_weight_29).reshape(-1, 3) else: cam_scale = 1 @@ -572,7 +569,8 @@ def __call__(self, src, label): output = { 'type': '3d_data_w_smpl', 'image': img, - 'target_theta': torch.from_numpy(theta_quat).float(), + # 'target_theta': torch.from_numpy(theta_quat).float(), + 'target_theta': torch.from_numpy(theta_rot_mat).float(), 'target_theta_weight': torch.from_numpy(theta_24_weights).float(), 'target_beta': torch.from_numpy(beta).float(), 'target_smpl_weight': target_smpl_weight, @@ -662,7 +660,7 @@ def calc_cam_scale_trans2(self, xyz_29, uvd_29, uvd_weight): # print('bad data') return 0, np.zeros(2), 0.0, -1, uvd_29 - xyz_29 = xyz_29 * self.depth_factor2meter # convert to meter + xyz_29 = xyz_29 * self.depth_factor2meter # convert to meter new_uvd = uvd_29.copy() num_joints = len(uvd_29)
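
Notes on the substantive changes, with small stand-alone Python sketches (approximations for orientation, not code lifted from the patch):

Across the configs, LOSS.TYPE moves from 'L1LossDimSMPLCam' to 'LaplaceLossDimSMPLCam' and TWIST_WEIGHT rises from 0.01 to 1; in criterion.py the stabilizing epsilon of the Laplace term loosens from 1e-9 to 1e-5. A minimal sketch of that weighted Laplace negative log-likelihood, assuming amp is the module-level constant 1 / sqrt(2 * pi) (it is defined outside this hunk):

import math
import torch

amp = 1 / math.sqrt(2 * math.pi)  # assumed module-level constant

def weighted_laplace_loss(input, sigma, target, weights, size_average=True):
    # L1 residual scaled by a learned per-element sigma, plus a log-sigma
    # penalty that stops sigma from growing without bound.
    out = torch.log(sigma / amp) + torch.abs(input - target) / (math.sqrt(2) * sigma + 1e-5)
    out = out * weights
    if size_average and weights.sum() > 0:
        return out.sum() / weights.sum()
    return out.sum()

Relative to plain L1, the learned sigma lets the network discount residuals on joints it is uncertain about, at the price of the log-sigma penalty.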
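The pose target switches representation: target_theta and pred_theta_mats go from 24 * 4 flattened quaternions to 24 * 9 flattened rotation matrices (the rotmat_to_quat call in SMPL.hybrik is commented out, and the transforms now emit batch_rodrigues_numpy(theta).reshape(24 * 9)), while the new cocoeft.py converts stored rotation matrices back to axis-angle with pytorch3d. A shape-bookkeeping sketch with stand-in poses:

import torch
from pytorch3d.transforms import axis_angle_to_matrix, matrix_to_axis_angle

batch_size = 2
theta_aa = torch.zeros(batch_size, 24, 3)               # stand-in SMPL axis-angle poses

rot_mats = axis_angle_to_matrix(theta_aa)               # (B, 24, 3, 3)
pred_theta_mats = rot_mats.reshape(batch_size, 24 * 9)  # new flattened layout (was 24 * 4)
theta_aa_back = matrix_to_axis_angle(rot_mats)          # (B, 24, 3), as in preprocess_pt_item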
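simple3dposeSMPLWithCam.py gains a decsigma head alongside decshape, decphi and deccam, and forward() now returns pred_sigma plus scores = 1 - sigma as a per-joint confidence; under flip_test the mirrored sigma is averaged with the straight pass, the same way pred_shape and pred_camera are. A sketch of just the head, on random stand-in features:

import torch
import torch.nn as nn

decsigma = nn.Linear(1024, 29)                      # mirrors self.decsigma

xc = torch.randn(8, 1024)                           # pooled, fc-projected features
sigma = decsigma(xc).reshape(8, 29, 1).sigmoid()    # per-joint sigma in (0, 1)
scores = 1 - sigma                                  # confidence, as in the output struct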
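In cocoeft.py, preprocess_pt_item gates on smpl_weight: samples whose EFT fit is untrusted (smpl_weight < 0.5) fall back to raw COCO keypoints routed through the s_coco_2_smpl_jt index map, while trusted samples take the stored pseudo-ground-truth uv_29 plus the SMPL depth. A condensed sketch of the fallback branch (visibility initialization simplified):

import numpy as np

s_coco_2_smpl_jt = [-1, 11, 12, -1, 13, 14, -1, 15, 16, -1, -1, -1,
                    -1, -1, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1]

joint_img_29 = np.zeros((29, 3))
joints_vis_29 = np.zeros((29, 3))
gt_joints = np.zeros((17, 3, 2))                 # stand-in COCO joints: (coord, vis) pairs
for smpl_id, coco_id in enumerate(s_coco_2_smpl_jt):
    if coco_id >= 0:                             # this SMPL joint has a COCO counterpart
        joint_img_29[smpl_id, :2] = gt_joints[coco_id, :2, 0]
        joints_vis_29[smpl_id, :2] = gt_joints[coco_id, :2, 1]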
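The same function encodes each of the 23 twist angles as a (cos, sin) pair and uses a sentinel to drop missing annotations: any angle below -10 rad zeroes out the corresponding weight row.

import numpy as np

twist_angle = np.full(23, -999.0)                                   # stand-in: all missing
phi = np.stack((np.cos(twist_angle), np.sin(twist_angle)), axis=1)  # (23, 2)
phi_weight = np.ones_like(phi)
phi_weight[twist_angle < -10, :] = 0                                # sentinel -> no supervision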
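calc_cam_scale_trans2 (present in both transform files) fits the camera in closed form. Per the in-code comment, each sufficiently visible joint contributes u * 256/f * (z + f/(256 * s)) = x + tx and the analogous v/ty row, which is linear in (k, tx, ty) with k = 1/s; stacking the rows and solving the normal equations recovers scale and translation. A standalone sketch of the solve (the patch additionally bails out when fewer than two joints are valid, and rejects fits whose back-projection error exceeds roughly 70 mm):

import numpy as np

def fit_weak_perspective(uvd, xyz, weight, f=1000.0):
    # uvd: (N, 3) normalized crop coords; xyz: (N, 3) in meters; weight: (N, 1) 0/1 mask
    n = len(uvd)
    Ax = np.zeros((n, 3))
    Ax[:, 0] = uvd[:, 0]
    Ax[:, 1] = -1.0
    Ay = np.zeros((n, 3))
    Ay[:, 0] = uvd[:, 1]
    Ay[:, 2] = -1.0
    A = np.concatenate([Ax * weight, Ay * weight], axis=0)
    bx = (xyz[:, 0] - 256.0 * uvd[:, 0] / f * xyz[:, 2]) * weight[:, 0]
    by = (xyz[:, 1] - 256.0 * uvd[:, 1] / f * xyz[:, 2]) * weight[:, 0]
    b = np.concatenate([bx, by], axis=0)
    k, tx, ty = np.linalg.solve(A.T @ A, A.T @ b)   # normal equations for (1/s, tx, ty)
    return 1.0 / k, np.array([tx, ty])              # cam_scale, cam_trans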