TASK
DATASET
MODEL
METRIC NAME
METRIC VALUE
GLOBAL RANK
EXTRA DATA
REMOVE
Zero-Shot Video Retrieval
ActivityNet
LanguageBind(ViT-L/14)
text-to-video R@1
38.4
# 6
Zero-Shot Video Retrieval
ActivityNet
LanguageBind(ViT-L/14)
video-to-text R@1
35.7
# 6
Zero-Shot Video Retrieval
ActivityNet
LanguageBind(ViT-L/14)
text-to-video R@10
77.9
# 7
Zero-Shot Video Retrieval
ActivityNet
LanguageBind(ViT-L/14)
text-to-video R@5
66.6
# 7
Zero-Shot Video Retrieval
ActivityNet
LanguageBind(ViT-L/14)
video-to-text R@5
65.8
# 6
Zero-Shot Video Retrieval
ActivityNet
LanguageBind(ViT-L/14)
video-to-text R@10
77.8
# 6
Zero-Shot Video Retrieval
ActivityNet
LanguageBind(ViT-H/14)
text-to-video R@1
41.0
# 5
Zero-Shot Video Retrieval
ActivityNet
LanguageBind(ViT-H/14)
video-to-text R@1
39.1
# 5
Zero-Shot Video Retrieval
ActivityNet
LanguageBind(ViT-H/14)
text-to-video R@10
80.0
# 3
Zero-Shot Video Retrieval
ActivityNet
LanguageBind(ViT-H/14)
text-to-video R@5
68.4
# 5
Zero-Shot Video Retrieval
ActivityNet
LanguageBind(ViT-H/14)
video-to-text R@5
69.8
# 3
Zero-Shot Video Retrieval
ActivityNet
LanguageBind(ViT-H/14)
video-to-text R@10
81.1
# 3
Zero-shot Text to Audio Retrieval
AudioCaps
LanguageBind(FT)
R@10
67.6
# 2
Zero-shot Text to Audio Retrieval
AudioCaps
LanguageBind(FT)
Audio-to-text R@1
19.7
# 3
Zero-shot Text to Audio Retrieval
AudioCaps
LanguageBind(LoRA)
R@10
53.2
# 3
Zero-shot Text to Audio Retrieval
AudioCaps
LanguageBind(LoRA)
Audio-to-text R@1
12.2
# 4
Zero-shot Audio Classification
AudioSet
LanguageBind(FT)
Test mAP
30.0
# 1
Zero-shot Audio Classification
AudioSet
LanguageBind(LoRA)
Test mAP
27.7
# 2
Zero-shot Text to Audio Retrieval
Clotho
LanguageBind(FT)
text-to-audio R@1
16.7
# 2
Zero-shot Text to Audio Retrieval
Clotho
LanguageBind(FT)
text-to-audio R@10
52.0
# 1
Zero-shot Text to Audio Retrieval
Clotho
LanguageBind(LoRA)
text-to-audio R@1
12.1
# 4
Zero-shot Text to Audio Retrieval
Clotho
LanguageBind(LoRA)
text-to-audio R@10
44.0
# 3
Zero-Shot Video Retrieval
DiDeMo
LanguageBind(ViT-L/14)
text-to-video R@1
39.7
# 9
Zero-Shot Video Retrieval
DiDeMo
LanguageBind(ViT-L/14)
text-to-video R@5
65.5
# 9
Zero-Shot Video Retrieval
DiDeMo
LanguageBind(ViT-L/14)
text-to-video R@10
73.8
# 9
Zero-Shot Video Retrieval
DiDeMo
LanguageBind(ViT-L/14)
video-to-text R@1
38.4
# 6
Zero-Shot Video Retrieval
DiDeMo
LanguageBind(ViT-L/14)
text-to-video Median Rank
2.0
# 1
Zero-Shot Video Retrieval
DiDeMo
LanguageBind(ViT-L/14)
video-to-text R@5
66.6
# 6
Zero-Shot Video Retrieval
DiDeMo
LanguageBind(ViT-L/14)
video-to-text R@10
77.9
# 5
Zero-Shot Video Retrieval
DiDeMo
LanguageBind(ViT-H/14)
text-to-video R@1
39.9
# 8
Zero-Shot Video Retrieval
DiDeMo
LanguageBind(ViT-H/14)
text-to-video R@5
66.1
# 8
Zero-Shot Video Retrieval
DiDeMo
LanguageBind(ViT-H/14)
text-to-video R@10
74.6
# 8
Zero-Shot Video Retrieval
DiDeMo
LanguageBind(ViT-H/14)
video-to-text R@1
39.8
# 5
Zero-Shot Video Retrieval
DiDeMo
LanguageBind(ViT-H/14)
text-to-video Median Rank
2
# 1
Zero-Shot Video Retrieval
DiDeMo
LanguageBind(ViT-H/14)
video-to-text R@5
67.8
# 5
Zero-Shot Video Retrieval
DiDeMo
LanguageBind(ViT-H/14)
video-to-text R@10
76.2
# 6
Zero-Shot Environment Sound Classification
ESC-50
LanguageBind(LoRA)
Accuracy
91.8
# 3
Zero-Shot Environment Sound Classification
ESC-50
LanguageBind(FT)
Accuracy
94.0
# 2
Zero-Shot Action Recognition
Kinetics
LanguageBind
Top-1 Accuracy
64.1
# 9
Zero-Shot Action Recognition
Kinetics
LanguageBind
Top-5 Accuracy
85.7
# 6
Zero-shot Classification (unified classes)
LLVIP
LanguageBind
Balanced Accuracy
87.2
# 1
Zero-Shot Video Retrieval
MSR-VTT
LanguageBind(ViT-L/14)
text-to-video R@1
42.8
# 6
Zero-Shot Video Retrieval
MSR-VTT
LanguageBind(ViT-L/14)
text-to-video R@5
67.5
# 6
Zero-Shot Video Retrieval
MSR-VTT
LanguageBind(ViT-L/14)
text-to-video R@10
76.0
# 5
Zero-Shot Video Retrieval
MSR-VTT
LanguageBind(ViT-L/14)
video-to-text R@1
38.3
# 6
Zero-Shot Video Retrieval
MSR-VTT
LanguageBind(ViT-L/14)
text-to-video Median Rank
2.0
# 1
Zero-Shot Video Retrieval
MSR-VTT
LanguageBind(ViT-L/14)
video-to-text R@5
65.8
# 4
Zero-Shot Video Retrieval
MSR-VTT
LanguageBind(ViT-L/14)
video-to-text R@10
77.8
# 3
Zero-Shot Video Retrieval
MSR-VTT
LanguageBind(ViT-L/14)
video-to-text Median Rank
3.0
# 2
Zero-Shot Video Retrieval
MSR-VTT
LanguageBind(ViT-H/14)
text-to-video R@1
44.8
# 5
Zero-Shot Video Retrieval
MSR-VTT
LanguageBind(ViT-H/14)
text-to-video R@5
70.0
# 3
Zero-Shot Video Retrieval
MSR-VTT
LanguageBind(ViT-H/14)
text-to-video R@10
78.7
# 4
Zero-Shot Video Retrieval
MSR-VTT
LanguageBind(ViT-H/14)
video-to-text R@1
40.9
# 3
Zero-Shot Video Retrieval
MSR-VTT
LanguageBind(ViT-H/14)
text-to-video Median Rank
2
# 1
Zero-Shot Video Retrieval
MSR-VTT
LanguageBind(ViT-H/14)
video-to-text R@5
66.4
# 3
Zero-Shot Video Retrieval
MSR-VTT
LanguageBind(ViT-H/14)
video-to-text R@10
75.7
# 4
Zero-Shot Video Retrieval
MSR-VTT
LanguageBind(ViT-H/14)
video-to-text Median Rank
2
# 1
Zero-Shot Video Retrieval
MSVD
LanguageBind(ViT-L/14)
text-to-video R@1
54.1
# 3
Zero-Shot Video Retrieval
MSVD
LanguageBind(ViT-L/14)
video-to-text R@1
69.7
# 6
Zero-Shot Video Retrieval
MSVD
LanguageBind(ViT-L/14)
text-to-video R@5
81.1
# 3
Zero-Shot Video Retrieval
MSVD
LanguageBind(ViT-L/14)
text-to-video R@10
88.1
# 3
Zero-Shot Video Retrieval
MSVD
LanguageBind(ViT-L/14)
video-to-text R@5
91.8
# 3
Zero-Shot Video Retrieval
MSVD
LanguageBind(ViT-L/14)
video-to-text R@10
97.9
# 1
Zero-Shot Video Retrieval
MSVD
LanguageBind(ViT-L/14)
text-to-video Median Rank
1.0
# 1
Zero-Shot Video Retrieval
MSVD
LanguageBind(ViT-L/14)
video-to-text Median Rank
1.0
# 1
Zero-Shot Video Retrieval
MSVD
LanguageBind(ViT-H/14)
text-to-video R@1
53.9
# 4
Zero-Shot Video Retrieval
MSVD
LanguageBind(ViT-H/14)
video-to-text R@1
72.0
# 5
Zero-Shot Video Retrieval
MSVD
LanguageBind(ViT-H/14)
text-to-video R@5
80.4
# 4
Zero-Shot Video Retrieval
MSVD
LanguageBind(ViT-H/14)
text-to-video R@10
87.8
# 4
Zero-Shot Video Retrieval
MSVD
LanguageBind(ViT-H/14)
video-to-text R@5
91.4
# 4
Zero-Shot Video Retrieval
MSVD
LanguageBind(ViT-H/14)
video-to-text R@10
96.3
# 4
Zero-Shot Video Retrieval
MSVD
LanguageBind(ViT-H/14)
text-to-video Median Rank
1
# 1
Zero-Shot Video Retrieval
MSVD
LanguageBind(ViT-H/14)
video-to-text Median Rank
1
# 1
Zero-shot Scene Classification (unified classes)
NYU Depth v2
LanguageBind
Balanced Accuracy
65.1
# 1
Zero-shot Audio Classification
VGG-Sound
LanguageBind(LoRA)
Acc@1
28.9
# 3
Zero-shot Audio Classification
VGG-Sound
LanguageBind(FT)
Acc@1
38.6
# 1