-
Notifications
You must be signed in to change notification settings - Fork 2
/
config.yaml
executable file
·136 lines (115 loc) · 2.15 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Dataset and preprocessing settings.
# NOTE(review): units and exact semantics of these values are defined by the
# consuming code, which is not visible here — confirm before changing them.
data_config:
  # Text windowing.
  # NOTE(review): the stride (30) is larger than the length (25); if these
  # describe a sliding window, part of the input is skipped — confirm intent.
  clip_word_length: 25
  clip_word_stride: 30

  # Tokenisation.
  tokenizer: clip
  tokenize_max_length: &token_length 77  # aliased by gpt_config.context_length
  hidden_state_max_length: 20

  # Summarisation model identifier (looks like a model-hub id — confirm) and
  # the length bounds applied to its output.
  summary_model: sshleifer/distilbart-cnn-12-6
  summary_max_length: 30
  summary_min_length: 20

  # Image geometry; aliased by vit_config.image_resolution.
  # NOTE(review): axis order (height/width) is not visible here — confirm.
  image_size: &image_size [160, 256]

  # Three-element normalisation statistics (presumably one value per colour
  # channel — confirm the channel order with the data loader).
  image_mean: [0.3331, 0.3245, 0.3051]
  image_std: [0.2439, 0.2493, 0.2873]

  # Video clip sampling.
  clip_frame_num: 80
  video_clip_length: 16

  # Fractions of the data held out for test and validation.
  test_ratio: 0.00625
  val_ratio: 0.00625
# Settings grouped under vit_config (presumably the vision transformer —
# confirm against the consuming code).
# Keys are written without a space before the colon, matching
# `image_resolution` and the rest of the file; parsed keys are unchanged.
vit_config:
  pretrained_resolution: 224
  image_resolution: *image_size  # alias of data_config.image_size (160, 256)
  patch_size: 16
  width: 768
  layers: 12
  heads: 12
  output_dim: 512
# Settings grouped under gpt_config (presumably the text transformer —
# confirm against the consuming code).
# Keys are written without a space before the colon, matching the rest of
# the file; parsed keys are unchanged.
gpt_config:
  embed_dim: 512
  context_length: *token_length  # alias of data_config.tokenize_max_length (77)
  vocab_size: 49408
  layers: 12
  width: 512
  heads: 8
# Combined adapter settings: one layer count per modality plus a shared
# feature width.
# NOTE(review): the per-modality *_adapter_config sections in this file carry
# their own layer counts — confirm which set the consumer actually reads.
adapter_config:
  video_adapter_layers: 2
  text_adapter_layers: 2
  feature_dim: 512
# Adapter settings for the video branch (role inferred from the key name —
# confirm against the consuming code).
video_adapter_config:
  adapter_layers: 2
  feature_dim: 512
# Adapter settings for the text branch (role inferred from the key name —
# confirm against the consuming code).
# NOTE(review): adapter_layers is 0 here, while adapter_config sets
# text_adapter_layers to 2 — confirm which value is in effect.
text_adapter_config:
  adapter_layers: 0
  feature_dim: 512
# Adapter settings for the motion branch (role inferred from the key name —
# confirm against the consuming code).
motion_adapter_config:
  adapter_layers: 2
  feature_dim: 512
# Adapter settings for the action branch (role inferred from the key name —
# confirm against the consuming code).
action_adapter_config:
  adapter_layers: 2
  feature_dim: 512
# Settings for the temporal encoder (role inferred from the key name —
# confirm against the consuming code).
temporal_encoder_config:
  input_dim: 512
  # Written as an explicit null instead of a bare `embed_dim:`; both parse to
  # null, but the explicit form shows the empty value is deliberate.
  # NOTE(review): presumably the consumer substitutes a default — confirm.
  embed_dim: null
  depth: 2
  num_heads: 8
  # NOTE(review): the trailing "80" is the author's original note; it looks
  # like an alternative or earlier value — confirm.
  max_seq_len: 32  # 80
  # Canonical lowercase booleans (same parsed values as True/False).
  ff_glu: true
  ff_swish: true
  attn_one_kv_head: false
  rel_pos_bias: false
# Settings for the fusion encoder (role inferred from the key name —
# confirm against the consuming code).
fuse_encoder_config:
  input_dim: 512
  # Written as an explicit null instead of a bare `embed_dim:`; both parse to
  # null, but the explicit form shows the empty value is deliberate.
  # NOTE(review): presumably the consumer substitutes a default — confirm.
  embed_dim: null
  depth: 2
  num_heads: 8
  max_seq_len: 32
  # Canonical lowercase booleans (same parsed values as True/False).
  ff_glu: true
  ff_swish: true
  attn_one_kv_head: false
  rel_pos_bias: false
# Settings for the text encoder (role inferred from the key name —
# confirm against the consuming code).
text_encoder_config:
  input_dim: 512
  # Written as an explicit null instead of a bare `embed_dim:`; both parse to
  # null, but the explicit form shows the empty value is deliberate.
  # NOTE(review): presumably the consumer substitutes a default — confirm.
  embed_dim: null
  depth: 2
  num_heads: 8
  # NOTE(review): 80 is larger than data_config.tokenize_max_length (77);
  # harmless if this is only an upper bound — confirm.
  max_seq_len: 80
  # Canonical lowercase booleans (same parsed values as True/False).
  ff_glu: true
  ff_swish: true
  attn_one_kv_head: false
  rel_pos_bias: false
# Settings for the difference encoder (role inferred from the key name —
# confirm against the consuming code).
difference_encoder_config:
  input_dim: 512
  # Written as an explicit null instead of a bare `embed_dim:`; both parse to
  # null, but the explicit form shows the empty value is deliberate.
  # NOTE(review): presumably the consumer substitutes a default — confirm.
  embed_dim: null
  depth: 2
  num_heads: 8
  max_seq_len: 32
  # Canonical lowercase booleans (same parsed values as True/False).
  ff_glu: true
  ff_swish: true
  attn_one_kv_head: false
  rel_pos_bias: false
# Settings for the temporal-difference encoder (role inferred from the key
# name — confirm against the consuming code).
temporal_difference_encoder_config:
  input_dim: 512
  # Written as an explicit null instead of a bare `embed_dim:`; both parse to
  # null, but the explicit form shows the empty value is deliberate.
  # NOTE(review): presumably the consumer substitutes a default — confirm.
  embed_dim: null
  depth: 2
  num_heads: 8
  max_seq_len: 32
  # Canonical lowercase booleans (same parsed values as True/False).
  ff_glu: true
  ff_swish: true
  attn_one_kv_head: false
  rel_pos_bias: false
# Settings for the text-action encoder (role inferred from the key name —
# confirm against the consuming code).
text_action_encoder_config:
  input_dim: 512
  # Written as an explicit null instead of a bare `embed_dim:`; both parse to
  # null, but the explicit form shows the empty value is deliberate.
  # NOTE(review): presumably the consumer substitutes a default — confirm.
  embed_dim: null
  depth: 2
  num_heads: 8
  max_seq_len: 32
  # Canonical lowercase booleans (same parsed values as True/False).
  ff_glu: true
  ff_swish: true
  attn_one_kv_head: false
  rel_pos_bias: false