- Python
- PyTorch
- torchvision
- Full dependencies
- Build and install CRF:
- Install Swig
- CRF (not used in this work, but it is part of the code.)
# Build and install the two SWIG-wrapped CRF bilateral-filter extensions.
# Run this from the repository root.
# Steps are chained with && so that a failed `cd` or `swig` never lets
# `python setup.py install` run in the wrong directory, and "$cdir" is quoted
# so that a checkout path containing spaces still works.
cdir=$(pwd)
cd dlib/crf/crfwrapper/bilateralfilter &&
  swig -python -c++ bilateralfilter.i &&
  python setup.py install
# Return to the starting directory before building the second extension.
cd "$cdir" &&
  cd dlib/crf/crfwrapper/colorbilateralfilter &&
  swig -python -c++ colorbilateralfilter.i &&
  python setup.py install
See full requirements at ./dependencies/requirements.txt
To prepare the dataset for this project, follow the steps below:
- Refer to the WSOL Evaluation Repository for detailed instructions on dataset preparation.
- For guidance on metadata preparation, refer here.
- To download the dataset, follow the instructions provided here.
Generate pseudo-labels using any CAM-based method. For this work, we specifically employ CLIP-ES and save the Grad-CAMs generated for the training set.
Download pre-trained models for CLIP and ViT-EVA, and place them in the pretrained_clip
directory with the following structure:
pretrained_clip/ViT-B-16.pt
pretrained_clip/ViT-L-14-336px.pt
Ensure the following parameters are correctly set:
- metadata_root: Specify the path to the metadata generated in Step 3 (Dataset Preparation).
- data_root: Specify the path to the dataset root directory.
# Index of the GPU to use; exported so that child processes only see this device.
cudaid=0
CUDA_VISIBLE_DEVICES="${cudaid}"
export CUDA_VISIBLE_DEVICES
#######################################
# Ask the OS for a currently unused TCP port and store it in `freeport`.
# A short Python snippet binds a socket to port 0, prints the port number the
# OS assigned, and closes the socket again.
# Globals:   freeport (written; set to empty on failure)
# Arguments: none
# Outputs:   an error message on stderr when no port could be obtained
# Returns:   0 on success, 1 if neither `python` nor `python3` produced a port
#######################################
getfreeport() {
  local probe='import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()'

  # Prefer `python`, as before. Its stderr is silenced so that a missing
  # unversioned interpreter does not print noise before the fallback runs.
  if freeport=$(python -c "$probe" 2>/dev/null) && [ -n "$freeport" ]; then
    return 0
  fi

  # Fallback for systems that only provide `python3`.
  if freeport=$(python3 -c "$probe") && [ -n "$freeport" ]; then
    return 0
  fi

  # Fail loudly rather than leaving callers with an empty port.
  freeport=
  printf 'getfreeport: could not obtain a free port (is python installed?)\n' >&2
  return 1
}
# Environment for the training run: OpenMP thread count and the NCCL
# blocking-wait switch (the run uses --dist_backend nccl).
export OMP_NUM_THREADS=50
export NCCL_BLOCKING_WAIT=1

# Pick a free rendezvous port; getfreeport stores it in $freeport.
getfreeport

# Single-node, single-process launch of main.py.
# The script name and --local_world_size are passed exactly once, and
# --local_world_size matches --nproc_per_node=1.
# ${freeport:?...} aborts the command instead of passing an empty --master_port
# when no port was obtained.
torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 \
  --master_port="${freeport:?getfreeport did not set a port}" \
  main.py \
  --local_world_size=1 \
  --resize_size 256 \
  --crop_size 256 \
  --aug_ran_erase_input_img True \
  --path_pre_trained 'pre-tained-weights-to-start-with' \
  --pseduo_cams_dir_trainset 'path-to-precompted-cams-for pseduo-labels' \
  --class_anchors_cache_path 'class_anchors_cache/ILSVRC_anchor_cache_q_by_qr_decom.pt' \
  --encoder_name VIT-EVA \
  --arch VIT-EVA \
  --task CLIPCAM_DISTILL_TXTENCOD \
  --opt__name_optimizer sgd \
  --batch_size 8 \
  --max_epochs 10 \
  --freeze_cl False \
  --support_background False \
  --method CAM \
  --dataset ILSVRC \
  --box_v2_metric False \
  --cudaid 0 \
  --debug_subfolder DEBUG \
  --num_workers 1 \
  --spatial_pooling WGAP \
  --opt__lr 0.0001 \
  --opt__gamma 0.4 \
  --opt__step_size 10 \
  --binary_cl_patches_lambda 1 \
  --patch_level_contrastive_lambda 1 \
  --cl_w_patches_lambda 1 \
  --exp_id example_exp \
  --dist_backend nccl \
  --sl_clc_use_roi True \
  --sl_clc_min 100 \
  --sl_clc_max 250 \
  --sl_clc_max_p 0.6 \
  --sl_clc_min_p 0.6 \
  --sl_clc_roi_method largest \
  --amp True