diff --git a/README.md b/README.md index 85071d5..3208887 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@
-
+
diff --git a/assets/dancer1.gif b/assets/dancer1.gif deleted file mode 100644 index fddf720..0000000 Binary files a/assets/dancer1.gif and /dev/null differ diff --git a/assets/dancer2.gif b/assets/dancer2.gif deleted file mode 100644 index 6b67a5c..0000000 Binary files a/assets/dancer2.gif and /dev/null differ diff --git a/assets/dancer3.gif b/assets/dancer3.gif deleted file mode 100644 index 124e4ff..0000000 Binary files a/assets/dancer3.gif and /dev/null differ diff --git a/assets/hybrik_dance1.gif b/assets/hybrik_dance1.gif new file mode 100644 index 0000000..a6f889c Binary files /dev/null and b/assets/hybrik_dance1.gif differ diff --git a/assets/hybrik_dance2.gif b/assets/hybrik_dance2.gif new file mode 100644 index 0000000..884b19e Binary files /dev/null and b/assets/hybrik_dance2.gif differ diff --git a/assets/hybrik_dance3.gif b/assets/hybrik_dance3.gif new file mode 100644 index 0000000..65dc4de Binary files /dev/null and b/assets/hybrik_dance3.gif differ diff --git a/configs/256x192_adam_lr1e-3-hrw48_cam_2x_w_pw3d_3dhp.yaml b/configs/256x192_adam_lr1e-3-hrw48_cam_2x_w_pw3d_3dhp.yaml index 0d82d33..51946a0 100644 --- a/configs/256x192_adam_lr1e-3-hrw48_cam_2x_w_pw3d_3dhp.yaml +++ b/configs/256x192_adam_lr1e-3-hrw48_cam_2x_w_pw3d_3dhp.yaml @@ -48,13 +48,13 @@ MODEL: - 2200 - 2200 LOSS: - TYPE: 'L1LossDimSMPLCam' + TYPE: 'LaplaceLossDimSMPLCam' ELEMENTS: BETA_WEIGHT: 1 BETA_REG_WEIGHT: 0 PHI_REG_WEIGHT: 0.0001 LEAF_REG_WEIGHT: 0 - TWIST_WEIGHT: 0.01 + TWIST_WEIGHT: 1 THETA_WEIGHT: 0.01 UVD24_WEIGHT: 1 XYZ24_WEIGHT: 0 diff --git a/configs/256x192_adam_lr1e-3-res34_smpl_3d_cam_2x_mix.yaml b/configs/256x192_adam_lr1e-3-res34_smpl_3d_cam_2x_mix.yaml index c97669b..aeba1f7 100644 --- a/configs/256x192_adam_lr1e-3-res34_smpl_3d_cam_2x_mix.yaml +++ b/configs/256x192_adam_lr1e-3-res34_smpl_3d_cam_2x_mix.yaml @@ -53,7 +53,7 @@ LOSS: BETA_REG_WEIGHT: 0 PHI_REG_WEIGHT: 0.0001 LEAF_REG_WEIGHT: 0 - TWIST_WEIGHT: 0.01 + TWIST_WEIGHT: 1 THETA_WEIGHT: 0.01 UVD24_WEIGHT: 1 XYZ24_WEIGHT: 1 diff --git a/configs/256x192_adam_lr1e-3-res34_smpl_3d_cam_2x_mix_w_pw3d.yaml b/configs/256x192_adam_lr1e-3-res34_smpl_3d_cam_2x_mix_w_pw3d.yaml index 17b4919..6f13647 100644 --- a/configs/256x192_adam_lr1e-3-res34_smpl_3d_cam_2x_mix_w_pw3d.yaml +++ b/configs/256x192_adam_lr1e-3-res34_smpl_3d_cam_2x_mix_w_pw3d.yaml @@ -47,13 +47,13 @@ MODEL: - 2200 - 2200 LOSS: - TYPE: 'L1LossDimSMPLCam' + TYPE: 'LaplaceLossDimSMPLCam' ELEMENTS: BETA_WEIGHT: 1 BETA_REG_WEIGHT: 0 PHI_REG_WEIGHT: 0.0001 LEAF_REG_WEIGHT: 0 - TWIST_WEIGHT: 0.01 + TWIST_WEIGHT: 1 THETA_WEIGHT: 0.01 UVD24_WEIGHT: 1 XYZ24_WEIGHT: 0 diff --git a/configs/256x192_adam_lr1e-3-res50_reg_smpl_3d_cam_2x_mix_w_pw3d.yaml b/configs/256x192_adam_lr1e-3-res50_reg_smpl_3d_cam_2x_mix_w_pw3d.yaml index 8d388c2..696e987 100644 --- a/configs/256x192_adam_lr1e-3-res50_reg_smpl_3d_cam_2x_mix_w_pw3d.yaml +++ b/configs/256x192_adam_lr1e-3-res50_reg_smpl_3d_cam_2x_mix_w_pw3d.yaml @@ -53,7 +53,7 @@ LOSS: BETA_REG_WEIGHT: 0 PHI_REG_WEIGHT: 0.0001 LEAF_REG_WEIGHT: 0 - TWIST_WEIGHT: 0.01 + TWIST_WEIGHT: 1 THETA_WEIGHT: 0.01 UVD24_WEIGHT: 1 XYZ24_WEIGHT: 0 diff --git a/hybrik/datasets/cocoeft.py b/hybrik/datasets/cocoeft.py new file mode 100644 index 0000000..1ae10c5 --- /dev/null +++ b/hybrik/datasets/cocoeft.py @@ -0,0 +1,235 @@ +"""MS COCO Human keypoint dataset.""" +import os + +# import scipy.misc +import cv2 +import joblib +import numpy as np +import torch +import torch.utils.data as data +from hybrik.utils.presets.simple_transform_3d_cam_eft import SimpleTransform3DCamEFT 
+from pytorch3d.transforms.rotation_conversions import matrix_to_axis_angle + +s_coco_2_smpl_jt = [ + -1, 11, 12, + -1, 13, 14, + -1, 15, 16, + -1, -1, -1, + -1, -1, -1, + -1, + 5, 6, + 7, 8, + 9, 10, + -1, -1 +] + + +class COCO_EFT_3D(data.Dataset): + """ COCO Person dataset. + Parameters + ---------- + ann_file: str, + Path to the annotation json file. + root: str, default './data/coco' + Path to the ms coco dataset. + train: bool, default is True + If true, will set as training mode. + skip_empty: bool, default is False + Whether skip entire image if no valid label is found. Use `False` if this dataset is + for validation to avoid COCO metric error. + """ + CLASSES = ['person'] + num_joints = 17 + EVAL_JOINTS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + joints_name = ('nose', 'left_eye', 'right_eye', 'left_ear', 'right_ear', # 4 + 'left_shoulder', 'right_shoulder', # 6 + 'left_elbow', 'right_elbow', # 8 + 'left_wrist', 'right_wrist', # 10 + 'left_hip', 'right_hip', # 12 + 'left_knee', 'right_knee', # 14 + 'left_ankle', 'right_ankle') # 16 + + def __init__(self, + cfg, + ann_file, + root='./data/coco', + train=True, + skip_empty=True, + dpg=False, + lazy_import=False): + + self._cfg = cfg + self._ann_file = os.path.join(root, 'annotations', ann_file) + self._lazy_import = lazy_import + self._root = root + self._skip_empty = skip_empty + self._train = train + self._dpg = dpg + + self._scale_factor = cfg.DATASET.SCALE_FACTOR + self._color_factor = cfg.DATASET.COLOR_FACTOR + self._rot = cfg.DATASET.ROT_FACTOR + self._input_size = cfg.MODEL.IMAGE_SIZE + self._output_size = cfg.MODEL.HEATMAP_SIZE + + self._occlusion = cfg.DATASET.OCCLUSION + + self._crop = cfg.MODEL.EXTRA.CROP + self._sigma = cfg.MODEL.EXTRA.SIGMA + + self._check_centers = False + + self.num_class = len(self.CLASSES) + + self.num_joints_half_body = cfg.DATASET.NUM_JOINTS_HALF_BODY + self.prob_half_body = cfg.DATASET.PROB_HALF_BODY + + self.augment = cfg.MODEL.EXTRA.AUGMENT + + self._loss_type = cfg.LOSS['TYPE'] + + self.upper_body_ids = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + self.lower_body_ids = (11, 12, 13, 14, 15, 16) + + bbox_3d_shape = getattr(cfg.MODEL, 'BBOX_3D_SHAPE', [2200, 2200, 2200]) + # millimeter -> meter + self.bbox_3d_shape = [item * 1e-3 for item in bbox_3d_shape] + + self.transformation = SimpleTransform3DCamEFT( + self, scale_factor=self._scale_factor, + color_factor=self._color_factor, + occlusion=self._occlusion, + input_size=self._input_size, + output_size=self._output_size, + depth_dim=64, + bbox_3d_shape=self.bbox_3d_shape, + rot=self._rot, sigma=self._sigma, + train=self._train, add_dpg=self._dpg, + loss_type=self._loss_type, scale_mult=1.25) + + self.db = self.load_pt() + + def __getitem__(self, idx): + # get image id + img_path = self.db['img_path'][idx] + img_id = int(os.path.splitext(os.path.basename(img_path))[0]) + + # load ground truth, including bbox, keypoints, image size + label = {} + for k in self.db.keys(): + try: + label[k] = self.db[k][idx].copy() + except AttributeError: + label[k] = self.db[k][idx] + + label_new = self.preprocess_pt_item(label, idx) + # img = scipy.misc.imread(img_path, mode='RGB') + src = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB) + # transform ground truth into training label and apply data augmentation + target = self.transformation(src, label_new) + + img = target.pop('image') + bbox = target.pop('bbox') + + return img, target, img_id, bbox + + def __len__(self): + return len(self.db['img_path']) + + def load_pt(self): + db = 
joblib.load(self._ann_file + '_smpl_annot.pt', 'r') + return db + + @property + def joint_pairs(self): + """Joint pairs which defines the pairs of joint to be swapped + when the image is flipped horizontally.""" + return [[1, 2], [3, 4], [5, 6], [7, 8], + [9, 10], [11, 12], [13, 14], [15, 16]] + + def _get_box_center_area(self, bbox): + """Get bbox center""" + c = np.array([(bbox[0] + bbox[2]) / 2.0, (bbox[1] + bbox[3]) / 2.0]) + area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0]) + return c, area + + def _get_keypoints_center_count(self, keypoints): + """Get geometric center of all keypoints""" + keypoint_x = np.sum(keypoints[:, 0, 0] * (keypoints[:, 0, 1] > 0)) + keypoint_y = np.sum(keypoints[:, 1, 0] * (keypoints[:, 1, 1] > 0)) + num = float(np.sum(keypoints[:, 0, 1])) + return np.array([keypoint_x / num, keypoint_y / num]), num + + def preprocess_pt_item(self, label, idx): + + # for k, v in label.items(): + # print(k) + beta = label['shape'].copy() + theta = label['pose'].copy().reshape(24, 3, 3) + theta = matrix_to_axis_angle(torch.from_numpy(theta)).numpy() + # scalar + smpl_weight = label['smpl_weight'].copy().reshape(-1) + + joint_cam_17 = label['xyz_17'].reshape((17, 3)) + joint_cam_17 = joint_cam_17 - joint_cam_17[0] + joint_cam_29 = label['xyz_29'].reshape((29, 3)) + joint_cam_29 = joint_cam_29 - joint_cam_29[0] + + joint_img_17 = np.zeros((17, 3)) + joints_vis_17 = np.zeros((17, 3)) * smpl_weight + joint_img_29 = np.zeros((29, 3)) + joints_vis_29 = np.ones((29, 3)) * smpl_weight + joints_vis_xyz_29 = np.ones((29, 3)) * smpl_weight + gt_joints = label['joints_3d'] + + # if smpl_weight[0] < 0.5: + if float(smpl_weight) < 0.5: + for i in range(24): + id1 = i + id2 = s_coco_2_smpl_jt[i] + if id2 >= 0: + joint_img_29[id1, :2] = gt_joints[id2, :2, 0].copy() + joints_vis_29[id1, :2] = gt_joints[id2, :2, 1].copy() + else: + uv_29 = label['uv_29'] + joint_img_29[:, :2] = uv_29 + joint_img_29[:, 2] = joint_cam_29[:, 2] + + twist_angle = label['twist_angle'].reshape(23) + cos = np.cos(twist_angle) + sin = np.sin(twist_angle) + phi = np.stack((cos, sin), axis=1) + phi_weight = np.ones_like(phi) * smpl_weight[0] + + flag = (twist_angle < -10) + phi_weight[flag, :] = 0 + + root_cam = joint_cam_29[0] + + f = np.array([1000.0, 1000.0]) + c = np.array([128.0, 128.0]) + + return_label = { + 'bbox': label['bbox'], + 'img_id': idx, + 'img_path': label['img_path'], + 'img_name': label['img_path'], + 'joint_img_17': joint_img_17, + 'joint_vis_17': joints_vis_17, + 'joint_cam_17': joint_cam_17, + 'joint_relative_17': joint_cam_17, + 'joint_img_29': joint_img_29, + 'joint_vis_29': joints_vis_29, + 'joint_vis_xyz_29': joints_vis_xyz_29, + 'joint_cam_29': joint_cam_29, + 'twist_phi': phi, + 'twist_weight': phi_weight, + 'beta': beta, + 'theta': theta, + 'root_cam': root_cam, + 'f': f, + 'c': c, + 'smpl_weight': smpl_weight + } + + return return_label diff --git a/hybrik/datasets/mix_dataset2_cam.py b/hybrik/datasets/mix_dataset2_cam.py index a3ebe95..68a0cd1 100644 --- a/hybrik/datasets/mix_dataset2_cam.py +++ b/hybrik/datasets/mix_dataset2_cam.py @@ -32,7 +32,7 @@ 9, 14, 10, 15, 11, 16, - -1, -1, # 23 + -1, -1, # 23 # 7, # -1, -1, # 21, 26 @@ -253,10 +253,12 @@ def __getitem__(self, idx): target['target_weight_29'] = label_uvd_29_mask target['target_xyz_17'] = label_xyz_17 target['target_weight_17'] = label_xyz_17_mask - target['target_theta'] = torch.zeros(24 * 4) + # target['target_theta'] = torch.zeros(24 * 4) + target['target_theta'] = torch.zeros(24 * 9) target['target_beta'] = torch.zeros(10) 
target['target_smpl_weight'] = torch.zeros(1) - target['target_theta_weight'] = torch.zeros(24 * 4) + # target['target_theta_weight'] = torch.zeros(24 * 4) + target['target_theta_weight'] = torch.zeros(24 * 9) target['target_twist'] = torch.zeros(23, 2) target['target_twist_weight'] = torch.zeros(23, 2) target['target_xyz_weight_24'] = label_xyz_24_mask diff --git a/hybrik/datasets/mix_dataset_cam.py b/hybrik/datasets/mix_dataset_cam.py index 0bff3a6..75af4c4 100644 --- a/hybrik/datasets/mix_dataset_cam.py +++ b/hybrik/datasets/mix_dataset_cam.py @@ -260,10 +260,12 @@ def __getitem__(self, idx): target['target_weight_29'] = label_uvd_29_mask target['target_xyz_17'] = label_xyz_17 target['target_weight_17'] = label_xyz_17_mask - target['target_theta'] = torch.zeros(24 * 4) + # target['target_theta'] = torch.zeros(24 * 4) + target['target_theta'] = torch.zeros(24 * 9) target['target_beta'] = torch.zeros(10) target['target_smpl_weight'] = torch.zeros(1) - target['target_theta_weight'] = torch.zeros(24 * 4) + # target['target_theta_weight'] = torch.zeros(24 * 4) + target['target_theta_weight'] = torch.zeros(24 * 9) target['target_twist'] = torch.zeros(23, 2) target['target_twist_weight'] = torch.zeros(23, 2) target['target_xyz_weight_24'] = label_xyz_24_mask diff --git a/hybrik/models/HRNetWithCam.py b/hybrik/models/HRNetWithCam.py index 709c028..6158850 100644 --- a/hybrik/models/HRNetWithCam.py +++ b/hybrik/models/HRNetWithCam.py @@ -332,7 +332,7 @@ def forward(self, x, flip_test=False, **kwargs): pred_xyz_jts_24_struct = output.joints.float() / self.depth_factor # -0.5 ~ 0.5 pred_xyz_jts_17 = output.joints_from_verts.float() / self.depth_factor - pred_theta_mats = output.rot_mats.float().reshape(batch_size, 24 * 4) + pred_theta_mats = output.rot_mats.float().reshape(batch_size, 24 * 9) pred_xyz_jts_24 = pred_xyz_jts_29[:, :24, :].reshape(batch_size, 72) pred_xyz_jts_24_struct = pred_xyz_jts_24_struct.reshape(batch_size, 72) pred_xyz_jts_17_flat = pred_xyz_jts_17.reshape(batch_size, 17 * 3) diff --git a/hybrik/models/HRNetWithCamReg.py b/hybrik/models/HRNetWithCamReg.py index 1aea466..de3db7c 100644 --- a/hybrik/models/HRNetWithCamReg.py +++ b/hybrik/models/HRNetWithCamReg.py @@ -351,7 +351,7 @@ def forward(self, x, flip_test=False, **kwargs): pred_xyz_jts_24_struct = output.joints.float() / self.depth_factor # -0.5 ~ 0.5 pred_xyz_jts_17 = output.joints_from_verts.float() / self.depth_factor - pred_theta_mats = output.rot_mats.float().reshape(batch_size, 24 * 4) + pred_theta_mats = output.rot_mats.float().reshape(batch_size, 24 * 9) pred_xyz_jts_24 = pred_xyz_jts_29[:, :24, :].reshape(batch_size, 72) pred_xyz_jts_24_struct = pred_xyz_jts_24_struct.reshape(batch_size, 72) pred_xyz_jts_17_flat = pred_xyz_jts_17.reshape(batch_size, 17 * 3) diff --git a/hybrik/models/criterion.py b/hybrik/models/criterion.py index 4ee6844..327e2c4 100644 --- a/hybrik/models/criterion.py +++ b/hybrik/models/criterion.py @@ -22,7 +22,7 @@ def weighted_l1_loss(input, target, weights, size_average): def weighted_laplace_loss(input, sigma, target, weights, size_average): input = input target = target - out = torch.log(sigma / amp) + torch.abs(input - target) / (math.sqrt(2) * sigma + 1e-9) + out = torch.log(sigma / amp) + torch.abs(input - target) / (math.sqrt(2) * sigma + 1e-5) out = out * weights if size_average and weights.sum() > 0: return out.sum() / weights.sum() diff --git a/hybrik/models/layers/smpl/SMPL.py b/hybrik/models/layers/smpl/SMPL.py index 6fc8864..6095945 100644 --- 
a/hybrik/models/layers/smpl/SMPL.py +++ b/hybrik/models/layers/smpl/SMPL.py @@ -261,7 +261,7 @@ def hybrik(self, leaf_thetas=leaf_thetas) rot_mats = rot_mats.reshape(batch_size * 24, 3, 3) - rot_mats = rotmat_to_quat(rot_mats).reshape(batch_size, 24 * 4) + # rot_mats = rotmat_to_quat(rot_mats).reshape(batch_size, 24 * 4) if transl is not None: new_joints += transl.unsqueeze(dim=1) diff --git a/hybrik/models/layers/smpl/lbs.py b/hybrik/models/layers/smpl/lbs.py index bf45101..9c13fa5 100644 --- a/hybrik/models/layers/smpl/lbs.py +++ b/hybrik/models/layers/smpl/lbs.py @@ -991,6 +991,12 @@ def batch_get_pelvis_orient(rel_pose_skeleton, rel_rest_pose, parents, children, spine_norm = torch.norm(spine_final_loc, dim=1, keepdim=True) spine_norm = spine_final_loc / (spine_norm + 1e-8) + assert torch.sum(torch.isnan(spine_rest_loc) + ) == 0, ('spine_rest_loc', spine_rest_loc) + + assert torch.sum(torch.isnan(spine_final_loc) + ) == 0, ('spine_final_loc', spine_final_loc) + rot_mat_spine = vectors2rotmat(spine_rest_loc, spine_final_loc, dtype) assert torch.sum(torch.isnan(rot_mat_spine) diff --git a/hybrik/models/simple3dposeSMPLWithCam.py b/hybrik/models/simple3dposeSMPLWithCam.py index d4aa633..1839f98 100644 --- a/hybrik/models/simple3dposeSMPLWithCam.py +++ b/hybrik/models/simple3dposeSMPLWithCam.py @@ -98,8 +98,8 @@ def __init__(self, norm_layer=nn.BatchNorm2d, **kwargs): init_cam = torch.tensor([0.9, 0, 0]) self.register_buffer( 'init_cam', - torch.Tensor(init_cam).float()) - + torch.Tensor(init_cam).float()) + self.avg_pool = nn.AdaptiveAvgPool2d(1) self.fc1 = nn.Linear(self.feature_channel, 1024) self.drop1 = nn.Dropout(p=0.5) @@ -108,6 +108,7 @@ def __init__(self, norm_layer=nn.BatchNorm2d, **kwargs): self.decshape = nn.Linear(1024, 10) self.decphi = nn.Linear(1024, 23 * 2) # [cos(phi), sin(phi)] self.deccam = nn.Linear(1024, 3) + self.decsigma = nn.Linear(1024, 29) self.focal_length = kwargs['FOCAL_LENGTH'] self.bbox_3d_shape = kwargs['BBOX_3D_SHAPE'] if 'BBOX_3D_SHAPE' in kwargs else (2000, 2000, 2000) @@ -238,7 +239,7 @@ def forward(self, x, flip_test=False, **kwargs): x0 = self.avg_pool(x0) x0 = x0.view(x0.size(0), -1) init_shape = self.init_shape.expand(batch_size, -1) # (B, 10,) - init_cam = self.init_cam.expand(batch_size, -1) # (B, 3,) + init_cam = self.init_cam.expand(batch_size, -1) # (B, 3,) xc = x0 @@ -251,6 +252,7 @@ def forward(self, x, flip_test=False, **kwargs): pred_shape = delta_shape + init_shape pred_phi = self.decphi(xc) pred_camera = self.deccam(xc).reshape(batch_size, -1) + init_cam + sigma = self.decsigma(xc).reshape(batch_size, 29, 1).sigmoid() pred_phi = pred_phi.reshape(batch_size, 23, 2) @@ -267,6 +269,8 @@ def forward(self, x, flip_test=False, **kwargs): flip_pred_shape = flip_delta_shape + init_shape flip_pred_phi = self.decphi(flip_xc) flip_pred_camera = self.deccam(flip_xc).reshape(batch_size, -1) + init_cam + flip_sigma = self.decsigma(flip_x0).reshape( + batch_size, 29, 1).sigmoid() pred_shape = (pred_shape + flip_pred_shape) / 2 @@ -277,6 +281,9 @@ def forward(self, x, flip_test=False, **kwargs): flip_pred_camera[:, 1] = -flip_pred_camera[:, 1] pred_camera = (pred_camera + flip_pred_camera) / 2 + flip_sigma = self.flip_sigma(flip_sigma) + sigma = (sigma + flip_sigma) / 2 + camScale = pred_camera[:, :1].unsqueeze(1) camTrans = pred_camera[:, 1:].unsqueeze(1) @@ -309,8 +316,7 @@ def forward(self, x, flip_test=False, **kwargs): camera_root[:, 2] += camDepth[:, 0, 0] else: pred_xyz_jts_29[:, :, 2:] = pred_uvd_jts_29[:, :, 2:].clone() # unit: (self.depth_factor 
m) - pred_xyz_jts_29_meter = (pred_uvd_jts_29[:, :, :2] * self.input_size / self.focal_length) \ - * (pred_xyz_jts_29[:, :, 2:]*self.depth_factor + camDepth) - camTrans # unit: m + pred_xyz_jts_29_meter = (pred_uvd_jts_29[:, :, :2] * self.input_size / self.focal_length) * (pred_xyz_jts_29[:, :, 2:] * self.depth_factor + camDepth) - camTrans # unit: m pred_xyz_jts_29[:, :, :2] = pred_xyz_jts_29_meter / self.depth_factor # unit: (self.depth_factor m) @@ -322,7 +328,7 @@ def forward(self, x, flip_test=False, **kwargs): pred_xyz_jts_29_flat = pred_xyz_jts_29.reshape(batch_size, -1) output = self.smpl.hybrik( - pose_skeleton=pred_xyz_jts_29.type(self.smpl_dtype) * self.depth_factor, # unit: meter + pose_skeleton=pred_xyz_jts_29.type(self.smpl_dtype) * self.depth_factor, # unit: meter betas=pred_shape.type(self.smpl_dtype), phis=pred_phi.type(self.smpl_dtype), global_orient=None, @@ -333,7 +339,7 @@ def forward(self, x, flip_test=False, **kwargs): pred_xyz_jts_24_struct = output.joints.float() / self.depth_factor # -0.5 ~ 0.5 pred_xyz_jts_17 = output.joints_from_verts.float() / self.depth_factor - pred_theta_mats = output.rot_mats.float().reshape(batch_size, 24 * 4) + pred_theta_mats = output.rot_mats.float().reshape(batch_size, 24 * 9) pred_xyz_jts_24 = pred_xyz_jts_29[:, :24, :].reshape(batch_size, 72) pred_xyz_jts_24_struct = pred_xyz_jts_24_struct.reshape(batch_size, 72) pred_xyz_jts_17_flat = pred_xyz_jts_17.reshape(batch_size, 17 * 3) @@ -356,6 +362,8 @@ def forward(self, x, flip_test=False, **kwargs): cam_trans=camTrans[:, 0], cam_root=camera_root, transl=transl, + pred_sigma=sigma, + scores=1 - sigma, # uvd_heatmap=torch.stack([hm_x0, hm_y0, hm_z0], dim=2), # uvd_heatmap=heatmaps, # img_feat=x0 diff --git a/hybrik/models/simple3dposeSMPLWithCamReg.py b/hybrik/models/simple3dposeSMPLWithCamReg.py index 9ed4543..f59c0aa 100644 --- a/hybrik/models/simple3dposeSMPLWithCamReg.py +++ b/hybrik/models/simple3dposeSMPLWithCamReg.py @@ -256,7 +256,7 @@ def forward(self, x, flip_test=False, **kwargs): pred_xyz_jts_24_struct = output.joints.float() / self.depth_factor # -0.5 ~ 0.5 pred_xyz_jts_17 = output.joints_from_verts.float() / self.depth_factor - pred_theta_mats = output.rot_mats.float().reshape(batch_size, 24 * 4) + pred_theta_mats = output.rot_mats.float().reshape(batch_size, 24 * 9) pred_xyz_jts_24 = pred_xyz_jts_29[:, :24, :].reshape(batch_size, 72) pred_xyz_jts_24_struct = pred_xyz_jts_24_struct.reshape(batch_size, 72) pred_xyz_jts_17_flat = pred_xyz_jts_17.reshape(batch_size, 17 * 3) diff --git a/hybrik/utils/presets/simple_transform_3d_cam_eft.py b/hybrik/utils/presets/simple_transform_3d_cam_eft.py new file mode 100644 index 0000000..7053b0b --- /dev/null +++ b/hybrik/utils/presets/simple_transform_3d_cam_eft.py @@ -0,0 +1,592 @@ +import math +import random + +import cv2 +import numpy as np +import torch + +from ..bbox import _box_to_center_scale, _center_scale_to_box +from ..transforms import (addDPG, affine_transform, flip_joints_3d, flip_thetas, flip_xyz_joints_3d, + get_affine_transform, im_to_torch, batch_rodrigues_numpy, flip_twist, + rotmat_to_quat_numpy, rotate_xyz_jts, rot_aa, flip_cam_xyz_joints_3d) +from ..pose_utils import get_intrinsic_metrix + +s_coco_2_smpl_jt = [ + -1, 11, 12, + -1, 13, 14, + -1, 15, 16, + -1, -1, -1, + -1, -1, -1, + -1, + 5, 6, + 7, 8, + 9, 10, + -1, -1 +] + +smpl_parents = [-1, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 12, 13, 14, + 16, 17, 18, 19, 20, 21] + + +left_bones_idx = [ + (0, 1), (1, 4), (4, 7), (12, 13), + (13, 16), (16, 18), (18, 20) +] + 
+right_bones_idx = [ + (0, 2), (2, 5), (5, 8), (12, 14), + (14, 17), (17, 19), (19, 21) +] + +skeleton_29 = [ + (0, 1), (0, 2), (0, 3), (1, 4), (2, 5), (3, 6), # 5 + (4, 7), (5, 8), (6, 9), (7, 10), (8, 11), (9, 12), # 11 + (9, 13), (9, 14), (12, 15), (13, 16), (14, 17), (16, 18), # 17 + (17, 19), (18, 20), (19, 21), (20, 22), (21, 23), (15, 24), # 23 + (22, 25), (23, 26), (10, 27), (11, 28) # 27 +] + +skeleton_3dhp = np.array([(-1, -1)] * 28).astype(int) +skeleton_3dhp[ [6, 7, 17, 18, 19, 20] ] = np.array([ + (19, 20), (24, 25), (9, 10), (14, 15), (10, 11), (15, 16) + ]).astype(int) + + +class SimpleTransform3DCamEFT(object): + """Generation of cropped input person, pose coords, smpl parameters. + Parameters + ---------- + img: torch.Tensor + A tensor with shape: `(3, h, w)`. + label: dict + A dictionary with 4 keys: + `bbox`: [xmin, ymin, xmax, ymax] + `joints_3d`: numpy.ndarray with shape: (n_joints, 2), + including position and visible flag + `width`: image width + `height`: image height + dataset: + The dataset to be transformed, must include `joint_pairs` property for flipping. + scale_factor: int + Scale augmentation. + input_size: tuple + Input image size, as (height, width). + output_size: tuple + Heatmap size, as (height, width). + rot: int + Ratation augmentation. + train: bool + True for training trasformation. + """ + + def __init__(self, dataset, scale_factor, color_factor, occlusion, add_dpg, + input_size, output_size, depth_dim, bbox_3d_shape, + rot, sigma, train, loss_type='MSELoss', scale_mult=1.25, focal_length=1000, two_d=False, + root_idx=0): + + self._joint_pairs_17 = ((1, 4), (2, 5), (3, 6), (11, 14), (12, 15), (13, 16)) + self._joint_pairs_24 = ((1, 2), (4, 5), (7, 8), (10, 11), (13, 14), (16, 17), (18, 19), (20, 21), (22, 23)) + self._joint_pairs_29 = ((1, 2), (4, 5), (7, 8), (10, 11), (13, 14), (16, 17), (18, 19), (20, 21), (22, 23), (25, 26), (27, 28)) + + self._scale_factor = scale_factor + self._color_factor = color_factor + self._occlusion = occlusion + self._rot = rot + self._add_dpg = add_dpg + + self._input_size = input_size + self._heatmap_size = output_size + + self._sigma = sigma + self._train = train + self._loss_type = loss_type + self._aspect_ratio = float(input_size[1]) / input_size[0] # w / h + self._feat_stride = np.array(input_size) / np.array(output_size) + + self.pixel_std = 1 + + self.bbox_3d_shape = dataset.bbox_3d_shape + self._scale_mult = scale_mult + self.two_d = two_d + + # convert to unit: meter + self.depth_factor2meter = self.bbox_3d_shape[2] if self.bbox_3d_shape[2] < 50 else self.bbox_3d_shape[2]*1e-3 + + self.focal_length = focal_length + self.root_idx = root_idx + + if train: + self.num_joints_half_body = dataset.num_joints_half_body + self.prob_half_body = dataset.prob_half_body + + self.upper_body_ids = dataset.upper_body_ids + self.lower_body_ids = dataset.lower_body_ids + + def test_transform(self, src, bbox): + xmin, ymin, xmax, ymax = bbox + center, scale = _box_to_center_scale( + xmin, ymin, xmax - xmin, ymax - ymin, self._aspect_ratio, scale_mult=self._scale_mult) + scale = scale * 1.0 + + input_size = self._input_size + inp_h, inp_w = input_size + trans = get_affine_transform(center, scale, 0, [inp_w, inp_h]) + img = cv2.warpAffine(src, trans, (int(inp_w), int(inp_h)), flags=cv2.INTER_LINEAR) + bbox = _center_scale_to_box(center, scale) + + img = im_to_torch(img) + # mean + img[0].add_(-0.406) + img[1].add_(-0.457) + img[2].add_(-0.480) + + # std + img[0].div_(0.225) + img[1].div_(0.224) + img[2].div_(0.229) + + img_center 
= np.array([float(src.shape[1]) * 0.5, float(src.shape[0]) * 0.5]) + + return img, bbox, img_center + + def _integral_target_generator(self, joints_3d, num_joints, patch_height, patch_width): + target_weight = np.ones((num_joints, 3), dtype=np.float32) + target_weight[:, 0] = joints_3d[:, 0, 1] + target_weight[:, 1] = joints_3d[:, 0, 1] + target_weight[:, 2] = joints_3d[:, 0, 1] + + target = np.zeros((num_joints, 3), dtype=np.float32) + target[:, 0] = joints_3d[:, 0, 0] / patch_width - 0.5 + target[:, 1] = joints_3d[:, 1, 0] / patch_height - 0.5 + target[:, 2] = joints_3d[:, 2, 0] / self.bbox_3d_shape[0] + + target_weight[target[:, 0] > 0.5] = 0 + target_weight[target[:, 0] < -0.5] = 0 + target_weight[target[:, 1] > 0.5] = 0 + target_weight[target[:, 1] < -0.5] = 0 + target_weight[target[:, 2] > 0.5] = 0 + target_weight[target[:, 2] < -0.5] = 0 + + target = target.reshape((-1)) + target_weight = target_weight.reshape((-1)) + return target, target_weight + + def _integral_uvd_target_generator(self, joints_3d, num_joints, patch_height, patch_width): + + target_weight = np.ones((num_joints, 3), dtype=np.float32) + target_weight[:, 0] = joints_3d[:, 0, 1] + target_weight[:, 1] = joints_3d[:, 0, 1] + target_weight[:, 2] = joints_3d[:, 0, 1] + + target = np.zeros((num_joints, 3), dtype=np.float32) + target[:, 0] = joints_3d[:, 0, 0] / patch_width - 0.5 + target[:, 1] = joints_3d[:, 1, 0] / patch_height - 0.5 + target[:, 2] = joints_3d[:, 2, 0] / self.bbox_3d_shape[2] + + target_weight[target[:, 0] > 0.5] = 0 + target_weight[target[:, 0] < -0.5] = 0 + target_weight[target[:, 1] > 0.5] = 0 + target_weight[target[:, 1] < -0.5] = 0 + target_weight[target[:, 2] > 0.5] = 0 + target_weight[target[:, 2] < -0.5] = 0 + + target = target.reshape((-1)) + target_weight = target_weight.reshape((-1)) + return target, target_weight + + def _integral_xyz_target_generator(self, joints_3d, joints_3d_vis, num_joints): + target_weight = np.ones((num_joints, 3), dtype=np.float32) + target_weight[:, 0] = joints_3d_vis[:, 0] + target_weight[:, 1] = joints_3d_vis[:, 1] + target_weight[:, 2] = joints_3d_vis[:, 2] + + target = np.zeros((num_joints, 3), dtype=np.float32) + target[:, 0] = joints_3d[:, 0] / self.bbox_3d_shape[0] + target[:, 1] = joints_3d[:, 1] / self.bbox_3d_shape[1] + target[:, 2] = joints_3d[:, 2] / self.bbox_3d_shape[2] + + # if self.bbox_3d_shape[0] < 1000: + # print(self.bbox_3d_shape, target) + + # assert (target[0] == 0).all(), f'{target}, {self.bbox_3d_shape}' + + target = target.reshape((-1)) + target_weight = target_weight.reshape((-1)) + return target, target_weight + + def __call__(self, src, label): + if self.two_d: + assert NotImplementedError + else: + bbox = list(label['bbox']) + joint_img_17 = label['joint_img_17'].copy() + joint_relative_17 = label['joint_relative_17'].copy() + joint_cam_17 = label['joint_cam_17'].copy() + joints_vis_17 = label['joint_vis_17'].copy() + joint_img_29 = label['joint_img_29'].copy() + joint_cam_29 = label['joint_cam_29'].copy() + joints_vis_29 = label['joint_vis_29'].copy() + joints_vis_xyz_29 = label['joint_vis_xyz_29'].copy() + smpl_weight = label['smpl_weight'].copy() + # root_cam = label['root_cam'].copy() + # root_depth = root_cam[2] / self.bbox_3d_shape[2] + self.num_joints = joint_img_29.shape[0] + + beta = label['beta'].copy() + theta = label['theta'].copy() + + beta_kid = label['beta_kid'].copy() if 'beta_kid' in label else np.zeros(1) + + # assert not (theta < 1e-3).all(), label + + if 'twist_phi' in label.keys(): + twist_phi = 
label['twist_phi'].copy() + twist_weight = label['twist_weight'].copy() + else: + twist_phi = np.zeros((23, 2)) + twist_weight = np.zeros((23, 2)) + + gt_joints_17 = np.zeros((17, 3, 2), dtype=np.float32) + gt_joints_17[:, :, 0] = joint_img_17.copy() + gt_joints_17[:, :, 1] = joints_vis_17.copy() + gt_joints_29 = np.zeros((29, 3, 2), dtype=np.float32) + gt_joints_29[:, :, 0] = joint_img_29.copy() + gt_joints_29[:, :, 1] = joints_vis_29.copy() + + imgwidth, imght = src.shape[1], src.shape[0] + + input_size = self._input_size + + if self._add_dpg and self._train: + bbox = addDPG(bbox, imgwidth, imght) + + xmin, ymin, xmax, ymax = bbox + center, scale = _box_to_center_scale( + xmin, ymin, xmax - xmin, ymax - ymin, self._aspect_ratio, scale_mult=self._scale_mult) + + xmin, ymin, xmax, ymax = _center_scale_to_box(center, scale) + + # half body transform + if self._train and (np.sum(joints_vis_17[:, 0]) > self.num_joints_half_body and np.random.rand() < self.prob_half_body): + c_half_body, s_half_body = self.half_body_transform( + gt_joints_17[:, :, 0], joints_vis_17 + ) + + if c_half_body is not None and s_half_body is not None: + center, scale = c_half_body, s_half_body + + # rescale + if self._train: + sf = self._scale_factor + scale = scale * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) + else: + scale = scale * 1.0 + + # rotation + if self._train: + rf = self._rot + r = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) if random.random() <= 0.6 else 0 + else: + r = 0 + + if self._train and self._occlusion: + while True: + area_min = 0.0 + area_max = 0.3 + synth_area = (random.random() * (area_max - area_min) + area_min) * (xmax - xmin) * (ymax - ymin) + + ratio_min = 0.5 + ratio_max = 1 / 0.5 + synth_ratio = (random.random() * (ratio_max - ratio_min) + ratio_min) + + synth_h = math.sqrt(synth_area * synth_ratio) + synth_w = math.sqrt(synth_area / synth_ratio) + synth_xmin = random.random() * ((xmax - xmin) - synth_w - 1) + xmin + synth_ymin = random.random() * ((ymax - ymin) - synth_h - 1) + ymin + + if synth_xmin >= 0 and synth_ymin >= 0 and synth_xmin + synth_w < imgwidth and synth_ymin + synth_h < imght: + synth_xmin = int(synth_xmin) + synth_ymin = int(synth_ymin) + synth_w = int(synth_w) + synth_h = int(synth_h) + src[synth_ymin:synth_ymin + synth_h, synth_xmin:synth_xmin + synth_w, :] = np.random.rand(synth_h, synth_w, 3) * 255 + break + + joints_17_uvd = gt_joints_17 + joints_29_uvd = gt_joints_29 + + joint_cam_17_xyz = joint_cam_17 + joints_cam_24_xyz = joint_cam_29[:24] + + if random.random() > 0.75 and self._train: + assert src.shape[2] == 3 + src = src[:, ::-1, :] + + joints_17_uvd = flip_joints_3d(joints_17_uvd, imgwidth, self._joint_pairs_17) + joints_29_uvd = flip_joints_3d(joints_29_uvd, imgwidth, self._joint_pairs_29) + joint_cam_17_xyz = flip_cam_xyz_joints_3d(joint_cam_17_xyz, self._joint_pairs_17) + joints_cam_24_xyz = flip_cam_xyz_joints_3d(joints_cam_24_xyz, self._joint_pairs_24) + theta = flip_thetas(theta, self._joint_pairs_24) + twist_phi, twist_weight = flip_twist(twist_phi, twist_weight, self._joint_pairs_24) + center[0] = imgwidth - center[0] - 1 + + # rotate global theta + theta[0, :3] = rot_aa(theta[0, :3], r) + + theta_rot_mat = batch_rodrigues_numpy(theta).reshape(24 * 9) + # theta_quat = rotmat_to_quat_numpy(theta_rot_mat).reshape(24 * 4) + + # rotate xyz joints + joint_cam_17_xyz = rotate_xyz_jts(joint_cam_17_xyz, r) + joints_17_xyz = joint_cam_17_xyz - joint_cam_17_xyz[:1].copy() + joints_cam_24_xyz = rotate_xyz_jts(joints_cam_24_xyz, r) + 
joints_24_xyz = joints_cam_24_xyz - joints_cam_24_xyz[:1].copy() + + inp_h, inp_w = input_size + trans = get_affine_transform(center, scale, r, [inp_w, inp_h]) + trans_inv = get_affine_transform(center, scale, r, [inp_w, inp_h], inv=True).astype(np.float32) + intrinsic_param = get_intrinsic_metrix(label['f'], label['c'], inv=True).astype(np.float32) if 'f' in label.keys() else np.zeros((3, 3)).astype(np.float32) + joint_root = label['root_cam'].astype(np.float32) if 'root_cam' in label.keys() else np.zeros((3)).astype(np.float32) + depth_factor = np.array([self.bbox_3d_shape[2]]).astype(np.float32) if self.bbox_3d_shape else np.zeros((1)).astype(np.float32) + + img = cv2.warpAffine(src, trans, (int(inp_w), int(inp_h)), flags=cv2.INTER_LINEAR) + # affine transform + for i in range(17): + if joints_17_uvd[i, 0, 1] > 0.0: + joints_17_uvd[i, 0:2, 0] = affine_transform(joints_17_uvd[i, 0:2, 0], trans) + + for i in range(29): + if joints_29_uvd[i, 0, 1] > 0.0: + joints_29_uvd[i, 0:2, 0] = affine_transform(joints_29_uvd[i, 0:2, 0], trans) + + target_smpl_weight = torch.ones(1).float() * smpl_weight + theta_24_weights = np.ones((24, 9)) * smpl_weight + + theta_24_weights = theta_24_weights.reshape(24 * 9) + + # generate training targets + target_uvd_29, target_weight_29 = self._integral_uvd_target_generator(joints_29_uvd, 29, inp_h, inp_w) + target_xyz_17, target_weight_17 = self._integral_xyz_target_generator(joints_17_xyz, joints_vis_17, 17) + target_xyz_24, target_weight_24 = self._integral_xyz_target_generator(joints_24_xyz, joints_vis_29[:24, :], 24) + + target_weight_29 *= joints_vis_29.reshape(-1) + target_weight_24 *= joints_vis_xyz_29[:24, :].reshape(-1) + target_weight_17 *= joints_vis_17.reshape(-1) + bbox = _center_scale_to_box(center, scale) + + tmp_uvd_24 = target_uvd_29.reshape(-1, 3)[:24] + tmp_uvd_24_weight = target_weight_29.reshape(-1, 3)[:24] * target_weight_24.reshape(-1, 3) + + if self.focal_length > 0: + cam_scale, cam_trans, cam_valid, cam_error, new_uvd = self.calc_cam_scale_trans2( + target_xyz_24.reshape(-1, 3).copy(), + tmp_uvd_24.copy(), + tmp_uvd_24_weight.copy()) + + # target_uvd_29 = (target_uvd_29 * target_weight_29).reshape(-1, 3) + else: + cam_scale = 1 + cam_trans = np.zeros(2) + cam_valid = 0 + cam_error = 0 + + assert img.shape[2] == 3 + if self._train: + c_high = 1 + self._color_factor + c_low = 1 - self._color_factor + img[:, :, 0] = np.clip(img[:, :, 0] * random.uniform(c_low, c_high), 0, 255) + img[:, :, 1] = np.clip(img[:, :, 1] * random.uniform(c_low, c_high), 0, 255) + img[:, :, 2] = np.clip(img[:, :, 2] * random.uniform(c_low, c_high), 0, 255) + + img = im_to_torch(img) + # mean + img[0].add_(-0.406) + img[1].add_(-0.457) + img[2].add_(-0.480) + + # std + img[0].div_(0.225) + img[1].div_(0.224) + img[2].div_(0.229) + + img_center = np.array([float(imgwidth) * 0.5, float(imght) * 0.5]) + + # target_weight_29 = target_weight_29.reshape(29, 3) + # target_weight_29[:, 2] = 0 + # target_weight_29 = target_weight_29.reshape(-1) + + output = { + 'type': '3d_data_w_smpl', + 'image': img, + # 'target_theta': torch.from_numpy(theta_quat).float(), + 'target_theta': torch.from_numpy(theta_rot_mat).float(), + 'target_theta_weight': torch.from_numpy(theta_24_weights).float(), + 'target_beta': torch.from_numpy(beta).float(), + 'target_smpl_weight': target_smpl_weight, + 'target_uvd_29': torch.from_numpy(target_uvd_29.reshape(-1)).float(), + 'target_xyz_24': torch.from_numpy(target_xyz_24).float(), + 'target_weight_29': torch.from_numpy(target_weight_29).float(), + 
'target_weight_24': torch.from_numpy(target_weight_24).float(), + 'target_xyz_17': torch.from_numpy(target_xyz_17).float(), + 'target_weight_17': torch.from_numpy(target_weight_17).float(), + 'target_xyz_weight_24': torch.from_numpy(target_weight_24).float(), + 'trans_inv': torch.from_numpy(trans_inv).float(), + 'intrinsic_param': torch.from_numpy(intrinsic_param).float(), + 'joint_root': torch.from_numpy(joint_root).float(), + 'depth_factor': torch.from_numpy(depth_factor).float(), + 'bbox': torch.Tensor(bbox), + 'target_twist': torch.from_numpy(twist_phi).float(), + 'target_twist_weight': torch.from_numpy(twist_weight).float(), + 'camera_scale': torch.from_numpy(np.array([cam_scale])).float(), + 'camera_trans': torch.from_numpy(cam_trans).float(), + 'camera_valid': cam_valid, + 'camera_error': cam_error, + 'img_center': torch.from_numpy(img_center).float(), + 'target_beta_kid': torch.from_numpy(beta_kid).float(), + } + + return output + + def half_body_transform(self, joints, joints_vis): + upper_joints = [] + lower_joints = [] + for joint_id in range(self.num_joints): + if joints_vis[joint_id][0] > 0: + if joint_id in self.upper_body_ids: + upper_joints.append(joints[joint_id]) + else: + lower_joints.append(joints[joint_id]) + + if np.random.randn() < 0.5 and len(upper_joints) > 2: + selected_joints = upper_joints + else: + selected_joints = lower_joints \ + if len(lower_joints) > 2 else upper_joints + + if len(selected_joints) < 2: + return None, None + + selected_joints = np.array(selected_joints, dtype=np.float32) + center = selected_joints.mean(axis=0)[:2] + + left_top = np.amin(selected_joints, axis=0) + right_bottom = np.amax(selected_joints, axis=0) + + w = right_bottom[0] - left_top[0] + h = right_bottom[1] - left_top[1] + + if w > self._aspect_ratio * h: + h = w * 1.0 / self._aspect_ratio + elif w < self._aspect_ratio * h: + w = h * self._aspect_ratio + + scale = np.array( + [ + w * 1.0 / self.pixel_std, + h * 1.0 / self.pixel_std + ], + dtype=np.float32 + ) + + scale = scale * 1.5 + + return center, scale + + def calc_cam_scale_trans2(self, xyz_29, uvd_29, uvd_weight): + + f = self.focal_length + + # unit: meter + # the equation to be solved: + # u * 256 / f * (z + f/256 * 1/scale) = x + tx + # v * 256 / f * (z + f/256 * 1/scale) = y + ty + + weight = (uvd_weight.sum(axis=-1, keepdims=True) >= 3.0) * 1.0 # 24 x 1 + # assert weight.sum() >= 2, 'too few valid keypoints to calculate cam para' + + if weight.sum() < 2: + # print('bad data') + return 0, np.zeros(2), 0.0, -1, uvd_29 + + xyz_29 = xyz_29 * self.depth_factor2meter # convert to meter + new_uvd = uvd_29.copy() + + num_joints = len(uvd_29) + + Ax = np.zeros((num_joints, 3)) + Ax[:, 1] = -1 + Ax[:, 0] = uvd_29[:, 0] + + Ay = np.zeros((num_joints, 3)) + Ay[:, 2] = -1 + Ay[:, 0] = uvd_29[:, 1] + + Ax = Ax * weight + Ay = Ay * weight + + A = np.concatenate([Ax, Ay], axis=0) + + bx = (xyz_29[:, 0] - 256 * uvd_29[:, 0] / f * xyz_29[:, 2]) * weight[:, 0] + by = (xyz_29[:, 1] - 256 * uvd_29[:, 1] / f * xyz_29[:, 2]) * weight[:, 0] + b = np.concatenate([bx, by], axis=0) + + A_s = np.dot(A.T, A) + b_s = np.dot(A.T, b) + + cam_para = np.linalg.solve(A_s, b_s) + + trans = cam_para[1:] + scale = 1.0 / cam_para[0] + + target_camera = np.zeros(3) + target_camera[0] = scale + target_camera[1:] = trans + + backed_projected_xyz = self.back_projection(uvd_29, target_camera, f) + backed_projected_xyz[:, 2] = backed_projected_xyz[:, 2] * self.depth_factor2meter + diff = np.sum((backed_projected_xyz - xyz_29)**2, axis=-1) * weight[:, 0] + diff 
= np.sqrt(diff).sum() / (weight.sum() + 1e-6) * 1000 # roughly mpjpe > 70 + # print(scale, trans, diff) + if diff < 70: + new_uvd = self.projection(xyz_29, target_camera, f) + return scale, trans, 1.0, diff, new_uvd * uvd_weight + else: + return scale, trans, 0.0, diff, new_uvd + + def projection(self, xyz, camera, f): + # xyz: unit: meter, u = f/256 * (x+dx) / (z+dz) + transl = camera[1:3] + scale = camera[0] + z_cam = xyz[:, 2:] + f / (256.0 * scale) # J x 1 + uvd = np.zeros_like(xyz) + uvd[:, 2] = xyz[:, 2] / self.bbox_3d_shape[2] + uvd[:, :2] = f / 256.0 * (xyz[:, :2] + transl) / z_cam + return uvd + + def back_projection(self, uvd, pred_camera, focal_length=5000.): + camScale = pred_camera[:1].reshape(1, -1) + camTrans = pred_camera[1:].reshape(1, -1) + + camDepth = focal_length / (256 * camScale) + + pred_xyz = np.zeros_like(uvd) + pred_xyz[:, 2] = uvd[:, 2].copy() + pred_xyz[:, :2] = (uvd[:, :2] * 256 / focal_length) * (pred_xyz[:, 2:] * self.depth_factor2meter + camDepth) - camTrans + + return pred_xyz + + +def _box_to_center_scale_nosquare(x, y, w, h, aspect_ratio=1.0, scale_mult=1.5): + """Convert box coordinates to center and scale. + adapted from https://github.com/Microsoft/human-pose-estimation.pytorch + """ + pixel_std = 1 + center = np.zeros((2), dtype=np.float32) + center[0] = x + w * 0.5 + center[1] = y + h * 0.5 + + scale = np.array( + [w * 1.0 / pixel_std, h * 1.0 / pixel_std], dtype=np.float32) + if center[0] != -1: + scale = scale * scale_mult + return center, scale diff --git a/hybrik/utils/presets/simple_transform_3d_smpl_cam.py b/hybrik/utils/presets/simple_transform_3d_smpl_cam.py index 9890be8..8f62dad 100644 --- a/hybrik/utils/presets/simple_transform_3d_smpl_cam.py +++ b/hybrik/utils/presets/simple_transform_3d_smpl_cam.py @@ -8,7 +8,7 @@ from ..bbox import _box_to_center_scale, _center_scale_to_box from ..transforms import (addDPG, affine_transform, flip_joints_3d, flip_thetas, flip_xyz_joints_3d, get_affine_transform, im_to_torch, batch_rodrigues_numpy, flip_twist, - rotmat_to_quat_numpy, rotate_xyz_jts, rot_aa, flip_cam_xyz_joints_3d) + rotate_xyz_jts, rot_aa, flip_cam_xyz_joints_3d) from ..pose_utils import get_intrinsic_metrix s_coco_2_smpl_jt = [ @@ -105,7 +105,7 @@ class SimpleTransform3DSMPLCam(object): def __init__(self, dataset, scale_factor, color_factor, occlusion, add_dpg, input_size, output_size, depth_dim, bbox_3d_shape, rot, sigma, train, loss_type='MSELoss', scale_mult=1.25, focal_length=1000, two_d=False, - root_idx=0, get_paf=False): + root_idx=0): if two_d: self._joint_pairs = dataset.joint_pairs else: @@ -140,9 +140,6 @@ def __init__(self, dataset, scale_factor, color_factor, occlusion, add_dpg, self.focal_length = focal_length self.root_idx = root_idx - self.get_paf = get_paf - # self.use_camera = use_camera - if train: self.num_joints_half_body = dataset.num_joints_half_body self.prob_half_body = dataset.prob_half_body @@ -454,7 +451,6 @@ def __call__(self, src, label): joint_cam_17_xyz = joint_cam_17 joints_cam_24_xyz = joint_cam_29[:24] - if random.random() > 0.75 and self._train: assert src.shape[2] == 3 src = src[:, ::-1, :] @@ -470,8 +466,8 @@ def __call__(self, src, label): # rotate global theta theta[0, :3] = rot_aa(theta[0, :3], r) - theta_rot_mat = batch_rodrigues_numpy(theta) - theta_quat = rotmat_to_quat_numpy(theta_rot_mat).reshape(24 * 4) + theta_rot_mat = batch_rodrigues_numpy(theta).reshape(24 * 9) + # theta_quat = rotmat_to_quat_numpy(theta_rot_mat).reshape(24 * 4) # rotate xyz joints joint_cam_17_xyz = 
rotate_xyz_jts(joint_cam_17_xyz, r) @@ -497,9 +493,10 @@ def __call__(self, src, label): joints_29_uvd[i, 0:2, 0] = affine_transform(joints_29_uvd[i, 0:2, 0], trans) target_smpl_weight = torch.ones(1).float() - theta_24_weights = np.ones((24, 4)) - - theta_24_weights = theta_24_weights.reshape(24 * 4) + # theta_24_weights = np.ones((24, 4)) + # theta_24_weights = theta_24_weights.reshape(24 * 4) + theta_24_weights = np.ones((24, 9)) + theta_24_weights = theta_24_weights.reshape(24 * 9) # generate training targets target_uvd_29, target_weight_29 = self._integral_uvd_target_generator(joints_29_uvd, 29, inp_h, inp_w) @@ -517,9 +514,9 @@ def __call__(self, src, label): if self.focal_length > 0: cam_scale, cam_trans, cam_valid, cam_error, new_uvd = self.calc_cam_scale_trans2( target_xyz_24.reshape(-1, 3).copy(), - tmp_uvd_24.copy(), + tmp_uvd_24.copy(), tmp_uvd_24_weight.copy()) - + target_uvd_29 = (target_uvd_29 * target_weight_29).reshape(-1, 3) else: cam_scale = 1 @@ -572,7 +569,8 @@ def __call__(self, src, label): output = { 'type': '3d_data_w_smpl', 'image': img, - 'target_theta': torch.from_numpy(theta_quat).float(), + # 'target_theta': torch.from_numpy(theta_quat).float(), + 'target_theta': torch.from_numpy(theta_rot_mat).float(), 'target_theta_weight': torch.from_numpy(theta_24_weights).float(), 'target_beta': torch.from_numpy(beta).float(), 'target_smpl_weight': target_smpl_weight, @@ -662,7 +660,7 @@ def calc_cam_scale_trans2(self, xyz_29, uvd_29, uvd_weight): # print('bad data') return 0, np.zeros(2), 0.0, -1, uvd_29 - xyz_29 = xyz_29 * self.depth_factor2meter # convert to meter + xyz_29 = xyz_29 * self.depth_factor2meter # convert to meter new_uvd = uvd_29.copy() num_joints = len(uvd_29)
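
Notes on the substantive changes, with small stand-alone Python sketches (approximations for orientation, not code lifted from the patch):

Across the configs, LOSS.TYPE moves from 'L1LossDimSMPLCam' to 'LaplaceLossDimSMPLCam' and TWIST_WEIGHT rises from 0.01 to 1; in criterion.py the stabilizing epsilon of the Laplace term loosens from 1e-9 to 1e-5. A minimal sketch of that weighted Laplace negative log-likelihood, assuming amp is the module-level constant 1 / sqrt(2 * pi) (it is defined outside this hunk):

import math
import torch

amp = 1 / math.sqrt(2 * math.pi)  # assumed module-level constant

def weighted_laplace_loss(input, sigma, target, weights, size_average=True):
    # L1 residual scaled by a learned per-element sigma, plus a log-sigma
    # penalty that stops sigma from growing without bound.
    out = torch.log(sigma / amp) + torch.abs(input - target) / (math.sqrt(2) * sigma + 1e-5)
    out = out * weights
    if size_average and weights.sum() > 0:
        return out.sum() / weights.sum()
    return out.sum()

Relative to plain L1, the learned sigma lets the network discount residuals on joints it is uncertain about, at the price of the log-sigma penalty.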
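The pose target switches representation: target_theta and pred_theta_mats go from 24 * 4 flattened quaternions to 24 * 9 flattened rotation matrices (the rotmat_to_quat call in SMPL.hybrik is commented out, and the transforms now emit batch_rodrigues_numpy(theta).reshape(24 * 9)), while the new cocoeft.py converts stored rotation matrices back to axis-angle with pytorch3d. A shape-bookkeeping sketch with stand-in poses:

import torch
from pytorch3d.transforms import axis_angle_to_matrix, matrix_to_axis_angle

batch_size = 2
theta_aa = torch.zeros(batch_size, 24, 3)               # stand-in SMPL axis-angle poses

rot_mats = axis_angle_to_matrix(theta_aa)               # (B, 24, 3, 3)
pred_theta_mats = rot_mats.reshape(batch_size, 24 * 9)  # new flattened layout (was 24 * 4)
theta_aa_back = matrix_to_axis_angle(rot_mats)          # (B, 24, 3), as in preprocess_pt_item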
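simple3dposeSMPLWithCam.py gains a decsigma head alongside decshape, decphi and deccam, and forward() now returns pred_sigma plus scores = 1 - sigma as a per-joint confidence; under flip_test the mirrored sigma is averaged with the straight pass, the same way pred_shape and pred_camera are. A sketch of just the head, on random stand-in features:

import torch
import torch.nn as nn

decsigma = nn.Linear(1024, 29)                      # mirrors self.decsigma

xc = torch.randn(8, 1024)                           # pooled, fc-projected features
sigma = decsigma(xc).reshape(8, 29, 1).sigmoid()    # per-joint sigma in (0, 1)
scores = 1 - sigma                                  # confidence, as in the output struct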
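In cocoeft.py, preprocess_pt_item gates on smpl_weight: samples whose EFT fit is untrusted (smpl_weight < 0.5) fall back to raw COCO keypoints routed through the s_coco_2_smpl_jt index map, while trusted samples take the stored pseudo-ground-truth uv_29 plus the SMPL depth. A condensed sketch of the fallback branch (visibility initialization simplified):

import numpy as np

s_coco_2_smpl_jt = [-1, 11, 12, -1, 13, 14, -1, 15, 16, -1, -1, -1,
                    -1, -1, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1]

joint_img_29 = np.zeros((29, 3))
joints_vis_29 = np.zeros((29, 3))
gt_joints = np.zeros((17, 3, 2))                 # stand-in COCO joints: (coord, vis) pairs
for smpl_id, coco_id in enumerate(s_coco_2_smpl_jt):
    if coco_id >= 0:                             # this SMPL joint has a COCO counterpart
        joint_img_29[smpl_id, :2] = gt_joints[coco_id, :2, 0]
        joints_vis_29[smpl_id, :2] = gt_joints[coco_id, :2, 1]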
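The same function encodes each of the 23 twist angles as a (cos, sin) pair and uses a sentinel to drop missing annotations: any angle below -10 rad zeroes out the corresponding weight row.

import numpy as np

twist_angle = np.full(23, -999.0)                                   # stand-in: all missing
phi = np.stack((np.cos(twist_angle), np.sin(twist_angle)), axis=1)  # (23, 2)
phi_weight = np.ones_like(phi)
phi_weight[twist_angle < -10, :] = 0                                # sentinel -> no supervision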
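calc_cam_scale_trans2 (present in both transform files) fits the camera in closed form. Per the in-code comment, each sufficiently visible joint contributes u * 256/f * (z + f/(256 * s)) = x + tx and the analogous v/ty row, which is linear in (k, tx, ty) with k = 1/s; stacking the rows and solving the normal equations recovers scale and translation. A standalone sketch of the solve (the patch additionally bails out when fewer than two joints are valid, and rejects fits whose back-projection error exceeds roughly 70 mm):

import numpy as np

def fit_weak_perspective(uvd, xyz, weight, f=1000.0):
    # uvd: (N, 3) normalized crop coords; xyz: (N, 3) in meters; weight: (N, 1) 0/1 mask
    n = len(uvd)
    Ax = np.zeros((n, 3))
    Ax[:, 0] = uvd[:, 0]
    Ax[:, 1] = -1.0
    Ay = np.zeros((n, 3))
    Ay[:, 0] = uvd[:, 1]
    Ay[:, 2] = -1.0
    A = np.concatenate([Ax * weight, Ay * weight], axis=0)
    bx = (xyz[:, 0] - 256.0 * uvd[:, 0] / f * xyz[:, 2]) * weight[:, 0]
    by = (xyz[:, 1] - 256.0 * uvd[:, 1] / f * xyz[:, 2]) * weight[:, 0]
    b = np.concatenate([bx, by], axis=0)
    k, tx, ty = np.linalg.solve(A.T @ A, A.T @ b)   # normal equations for (1/s, tx, ty)
    return 1.0 / k, np.array([tx, ty])              # cam_scale, cam_trans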