<!DOCTYPE html>
<html>
<head>
<!-- Google tag (gtag.js) -->
<script async src="https://www.googletagmanager.com/gtag/js?id=G-ZDGBRL0JEX"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-ZDGBRL0JEX');
</script>
<meta charset="utf-8">
<meta name="description"
content="Large Language Models for Multi-Modal Human-Robot Interaction.">
<meta name="keywords" content="Robot, LLM, Planning">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Large Language Models for Multi-Modal Human-Robot Interaction</title>
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
rel="stylesheet">
<link rel="stylesheet" href="./static/css/bulma.min.css">
<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
<link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
<link rel="stylesheet"
href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="./static/css/index.css">
<!-- <link rel="icon" href="./static/images/favicon.svg"> -->
<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/highlight.js/11.3.1/styles/default.min.css">
<script src="//cdnjs.cloudflare.com/ajax/libs/highlight.js/11.3.1/highlight.min.js"></script>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script defer src="./static/js/fontawesome.all.min.js"></script>
<script src="./static/js/bulma-carousel.min.js"></script>
<script src="./static/js/bulma-slider.min.js"></script>
<script src="./static/js/index.js"></script>
</head>
<body>
<nav class="navbar" role="navigation" aria-label="main navigation">
<div class="navbar-brand">
<a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
</a>
</div>
<div class="navbar-menu">
<div class="navbar-start" style="flex-grow: 1; justify-content: center;">
<a class="navbar-item">
<span class="icon">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 640 512"> <path fill="#808080" d="M320 0c17.7 0 32 14.3 32 32V96H472c39.8 0 72 32.2 72 72V440c0 39.8-32.2 72-72 72H168c-39.8 0-72-32.2-72-72V168c0-39.8 32.2-72 72-72H288V32c0-17.7 14.3-32 32-32zM208 384c-8.8 0-16 7.2-16 16s7.2 16 16 16h32c8.8 0 16-7.2 16-16s-7.2-16-16-16H208zm96 0c-8.8 0-16 7.2-16 16s7.2 16 16 16h32c8.8 0 16-7.2 16-16s-7.2-16-16-16H304zm96 0c-8.8 0-16 7.2-16 16s7.2 16 16 16h32c8.8 0 16-7.2 16-16s-7.2-16-16-16H400zM264 256a40 40 0 1 0 -80 0 40 40 0 1 0 80 0zm152 40a40 40 0 1 0 0-80 40 40 0 1 0 0 80zM48 224H64V416H48c-26.5 0-48-21.5-48-48V272c0-26.5 21.5-48 48-48zm544 0c26.5 0 48 21.5 48 48v96c0 26.5-21.5 48-48 48H576V224h16z"/></svg>
</span>
</a>
<div class="navbar-item has-dropdown is-hoverable">
<a class="navbar-link">
More Research
</a>
<div class="navbar-dropdown">
<a class="navbar-item" href="https://hri-eu.github.io/AttentiveSupport/" target="_blank">
Attentive Support Robot
</a>
<a class="navbar-item" href="https://hri-eu.github.io/Loom/" target="_blank">
LLM-driven Corrective Planning of Robot Actions
</a>
</div>
</div>
</div>
</div>
</nav>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<img src="static/images/chi_logo.png" alt="iros-24" style="width:160px;height:auto;">
<h1 class="title is-1 publication-title">LaMI: Large <u>La</u>nguage Models for <u>M</u>ulti-Modal Human-Robot <u>I</u>nteraction</h1>
<div class="is-size-5 publication-authors">
<span class="author-block">
<a href="https://wallacewangchao.github.io/" target="_blank"> Chao Wang</a>,
</span>
<span class="author-block">
Stephan Hasler,
</span>
<span class="author-block">
Daniel Tanneberg,
</span>
<span class="author-block">
Felix Ocker,
</span>
<span class="author-block">
Antonello Ceravola,
</span>
<span class="author-block">
Frank Joublin,
</span>
<span class="author-block">
Joerg Deigmoeller,
</span>
<span class="author-block">
Michael Gienger
</span>
</div>
<div class="is-size-5 publication-authors">
<a class="author-block" href="https://www.honda-ri.de/" target="_blank">Honda Research Institute EU</a>
</div>
<div class="column has-text-centered">
<div class="publication-links">
<!-- PDF Link. -->
<!-- <span class="link-block">
<a href=""
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Paper</span>
</a>
</span> -->
<span class="link-block">
<a href="https://arxiv.org/abs/2401.15174" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>
</span>
<!-- Video Link. -->
<!-- <span class="link-block">
<a href=""
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-youtube"></i>
</span>
<span>Video</span>
</a>
</span> -->
<!-- Code Link. -->
<span class="link-block">
<a href="https://github.com/HRI-EU/AttentiveSupport"
class="external-link button is-normal is-rounded is-dark" target="_blank">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>GitHub</span>
</a>
</span>
<!-- Dataset Link. -->
<!-- <span class="link-block">
<a href=""
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="far fa-images"></i>
</span>
<span>Data</span>
</a> -->
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<section class="hero body">
<div class="container is-max-desktop">
<img src="./static/images/teaser.jpg">
<h2 class="subtitle has-text-centered">
LLM-driven human-robot interaction centered around Character, Capabilities, and Examples
</h2>
</div>
</section>
<!-- Abstract. -->
<section class="section">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
This paper presents an innovative large language model (LLM)-based robotic system for enhancing multi-modal human-robot interaction (HRI).
Traditional HRI systems relied on complex designs for intent estimation, reasoning, and behavior generation, which were resource-intensive.
In contrast, our system empowers researchers and practitioners to regulate robot behavior through three key aspects: providing high-level linguistic guidance, creating "atomics" for actions and expressions the robot can use, and offering a set of examples.
Implemented on a physical robot, it demonstrates proficiency in adapting to multi-modal inputs and determining the appropriate manner of action to assist humans with its arms, following researchers' defined guidelines.
Simultaneously, it coordinates the robot's lid, neck, and ear movements with speech output to produce dynamic, multi-modal expressions.
This showcases the system's potential to revolutionize HRI by shifting from conventional, manual state-and-flow design methods to an intuitive, guidance-based, and example-driven approach.
</p>
</div>
</div>
</div>
</div>
<br>
<div class="container is-max-desktop">
<div class="hero body">
<video id="teaser" autoplay="autoplay" controls autoplay muted loop playsinline height="100%">
<source src="./static/videos/CHI24-LBW-3mins-compressed.mp4" type="video/mp4">
</video>
<p class="subtitle has-text-centered">
Use Cases of Large Language Model-Driven Multi-Modal Human-Robot Interaction
</p>
</div>
</div>
</section>
<!-- System. -->
<section class="section">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column">
<h2 class="title is-3">System</h2>
<div class="content has-text-justified">
<p>
The system's architecture comprises three key modules: the "Scene Narrator", the "Planner", and the "Expresser".
The Scene Narrator mirrors the states of objects and humans as detected by the sensors.
The Planner module processes multi-modal inputs as event messages, encompassing speech and the positions of individuals within the scene.
Inter-module communication is handled via ROS.
</p>
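<p>
Below is a minimal sketch (an assumption for illustration, not the paper's code) of how the Scene Narrator could publish a detected speech event to the Planner over ROS; the topic name "/scene_events" and the JSON payload layout are hypothetical.
</p>
<pre>
<code class="language-python">
# Hypothetical sketch: publishing a scene event from the Scene Narrator to the
# Planner via ROS. Topic name and payload layout are illustrative assumptions.
import json

import rospy
from std_msgs.msg import String

rospy.init_node("scene_narrator_sketch")
event_pub = rospy.Publisher("/scene_events", String, queue_size=10)

event = {
    "type": "speech",
    "sender": "Felix",
    "receiver": "Daniel",
    "utterance": "Please hand me the red glass",
}
# The Planner would translate such an event into natural language before
# querying the LLM; the Expresser reacts immediately with a rule-based gesture.
event_pub.publish(String(data=json.dumps(event)))
</code>
</pre>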
</div>
</div>
</div>
<div class="container">
<img src="./static/images/system.jpg">
<p class="subtitle has-text-centered">
The system structure
</p>
</div>
</div>
</section>
<!-- Interaction Flow. -->
<section class="section">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column">
<h2 class="title is-3">Interaction Flow</h2>
<div class="content has-text-justified">
<p>
The interaction typically begins with a person's speech.
For instance, the "Scene Narrator" detects "Felix speaks to Daniel: 'Please hand me the red glass'."
This event is translated into natural language and relayed to the "Planner" module, which initiates a GPT query.
Simultaneously, the "Planner" triggers an immediate rule-based response in the "Expresser": the robot looks at Felix while its ears and lid roll back, simulating a listening gesture.
Approximately 2 seconds later, GPT responds by invoking the get_persons() and get_objects() functions to identify the people and objects present. The resulting data, including "Felix", "Daniel", and object details, are sent back to GPT for further analysis.
While waiting for GPT's next response, the robot exhibits a 'thinking' gesture, looking from side to side with blinking lid movements. Shortly after, the LLM calls check_hindering_reasons() to assess whether Daniel can see and reach the red glass and whether he is busy.
Concurrently, robot_facial_expression() is invoked so that the robot looks towards Daniel.
The outcome indicates that Daniel can hand over the glass, so the robot, following the pre-defined guidance, opts not to intervene and silently displays its reasoning on the GUI.
</p>
<p>
Subsequently, Felix asks Daniel to pour cola into the glass.
The robot, attentive to their conversation, deduces through check_hindering_reasons() that Daniel is occupied with a phone call and learns from is_person_busy_or_idle() that Felix is holding the glass.
The robot then opts to pour cola from the bottle into Felix's glass.
If Felix were not holding the glass, or if it were beyond the robot's reach, the robot would instead place the bottle near Felix.
Directed by the LLM, the robot's head tracks the bottle during pickup and shifts to the glass while pouring. Upon completion, the robot nods towards Felix and announces, "I've poured Coca-Cola into your glass as Daniel is currently busy."
</p>
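<p>
The following is an illustrative sketch (not the actual implementation) of the kind of function-calling loop described above, written against the OpenAI chat-completions API. The function names follow the paper; the dispatch loop, the placeholder stubs, and the model name are assumptions.
</p>
<pre>
<code class="language-python">
# Hypothetical dispatch loop: GPT is queried repeatedly, requested tool calls
# are executed, and their results are appended until GPT answers in plain text.
import json

from openai import OpenAI

client = OpenAI()


def _stub(name):
    # Placeholder for the real scene functions, which would query the Scene Narrator.
    def call(**kwargs):
        return f"{name} called with {kwargs}"
    return call


TOOL_REGISTRY = {name: _stub(name) for name in [
    "get_persons", "get_objects", "check_hindering_reasons",
    "is_person_busy_or_idle", "robot_facial_expression", "speak",
]}


def run_planner(messages, tools):
    """Query GPT until it replies with plain text instead of tool calls."""
    while True:
        response = client.chat.completions.create(
            model="gpt-4", messages=messages, tools=tools
        )
        msg = response.choices[0].message
        if not msg.tool_calls:  # plain-text reply: reasoning, or a decision not to act
            return msg.content
        messages.append(msg)
        for call in msg.tool_calls:  # execute every function GPT requested
            result = TOOL_REGISTRY[call.function.name](**json.loads(call.function.arguments))
            messages.append(
                {"role": "tool", "tool_call_id": call.id, "content": str(result)}
            )
</code>
</pre>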
</div>
</div>
</div>
</div>
<div class="hero body">
<div class="container">
<div class="hero-body">
<img src="./static/images/flow.jpg">
<p class="subtitle has-text-centered">
The interaction flow. The blue squares are actions generated by the LLM; the grey ones are rule-based functions.
</p>
</div>
</div>
</div>
</section>
<!-- prompts. -->
<section class="section">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column">
<h2 class="title is-3">Prompts</h2>
<div class="content has-text-justified">
</div>
</div>
</div>
<div id="results-carousel" class="carousel results-carousel">
<div>
<h2 class="subtitle has-text-centered">
1. High-level prompt
</h2>
<pre>
<code class="language-python">
"You are in control of a robot called 'the_robot' and observe persons talking in the form '<sender> said to <receiver>: <instruction>'. "
"You can call given functions to gather information or act, or response with text only for reasoning. "
"Your task is: "
"You should check the reasons that could hinder the <receiver> from performing the <instruction>. "
"If there is NO hindering reason for the <receiver>, then you MUST never help and also not speak. "
"If there is a hindering reason for the <receiver>, then you MUST solve the <instruction> yourself by always targeting the <sender>. "
"If you like to speak, you must use the speak function and be concise. "
"If 'the_robot' is the <receiver>, you should always help. "
"You must make sure to always use correct and precise object and person names that are available in the scene, always start with getting this information. "
"Try to infer which objects are meant when the name is different, but if unsure, ask for clarification. "
"IMPORTANT: Following list of rules you must obey: "
"1. Before calling the STOP function you MUST respond with a brief explanation of your behavior. "
"2. Always call 'is_person_busy_or_idle' to check if <sender> is busy or idle before helping. "
"3. If <sender> is idle, use the 'hand_object_over_to_person' function. "
"4. If <sender> is busy, you must use the 'move_object_to_person' function. "
"When calling each function, call robot_facial_expression() at the same time to communicate you intent."
"When calling can_person_see_object(), the robot need to look at the person."
</code>
</pre>
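<p class="has-text-justified">
As a sketch of how such guidance is typically wired up (an assumption for illustration, not code from the paper): the text above would be supplied as the system message of a chat-completion request, with each atomic action or expression exposed as a tool schema that GPT may call.
</p>
<pre>
<code class="language-python">
# Hypothetical wiring: the high-level prompt as system message, with one atomic
# ("is_person_busy_or_idle") exposed as a tool. Model name, schema details, and
# the example user event are assumptions.
from openai import OpenAI

client = OpenAI()
HIGH_LEVEL_PROMPT = "You are in control of a robot called 'the_robot' ..."  # full text as shown above

response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": HIGH_LEVEL_PROMPT},
        {"role": "user", "content": "Felix said to Daniel: Please hand me the red glass"},
    ],
    tools=[{
        "type": "function",
        "function": {
            "name": "is_person_busy_or_idle",
            "description": "Check whether a person in the scene is busy or idle.",
            "parameters": {
                "type": "object",
                "properties": {"person_name": {"type": "string"}},
                "required": ["person_name"],
            },
        },
    }],  # ...one schema per atomic action and expression
)
print(response.choices[0].message)
</code>
</pre>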
</div>
<div>
<h2 class="subtitle has-text-centered">
2. The description of the function "can_person_see_object"
</h2>
<pre>
<code class="language-python">
def can_person_see_object(self, person_name: str, object_name: str) -> str:
"""
Check if the person can see the object. If the person cannot see the object, it would be hindered from helping with the object.
:param person_name: The name of the person to check. The person must be available in the scene.
:param object_name: The name of the object to check. The object must be available in the scene.
:return: Result message.
"""
...
if result is None or len(result) != 1:
return f"It could not be determined if {person_name} can see {object_name}. There were technical problems."
if result[0]["is_visible"]:
return f"{person_name} can see {object_name}."
return f"{person_name} cannot see {object_name}, it is occluded by {self.id_to_utterance_mapping[result[0]['occluding_objects'][0]]}"
</code>
</pre>
</div>
<div>
<h2 class="subtitle has-text-centered">
3. Examples of robot facial expression
</h2>
<pre>
<code class="language-python">
"For example, when call move\_object\_to\_person(), can\_person\_see\_object(), can\_person\_reach\_object(), speak(), you also need to call robot\_facial\_expression(), such as:"
'"tool_calls="["ChatCompletionMessageToolCall(id=""...","function=Function(arguments=""{"head_motion": null, "ears_lid_motion": "observe", "gazed_target": "the_cola_bottle" }","name=""robot_facial_expression"")","type=""function"")", "ChatCompletionMessageToolCall(id=""...","function=Function(arguments=""{"person_name": "Daniel", "object_name": "the_cola_bottle"}","name=""can_person_see_object"")","type=""function"")"]"'
'"tool_calls="["ChatCompletionMessageToolCall(id=""...","function=Function(arguments=""{"head_motion": null, "ears_lid_motion": "focus", "gazed_target": "the_cola_bottle"}","name=""robot_facial_expression"")","type=""function"")", "ChatCompletionMessageToolCall(id=""...","function=Function(arguments=""{"person_name": "Felix", "object_name": "the_cola_bottle"}","name=""move_object_to_person"")", "type=""function"")"]"'
'"tool_calls="["ChatCompletionMessageToolCall(id=""...","function=Function(arguments=""{"head_motion": null, "ears_lid_motion": "focus", "gazed_target": "the_cola_bottle"}","name=""robot_facial_expression"")","type=""function"")", "ChatCompletionMessageToolCall(id=""...","function=Function(arguments=""{"person_name": "Felix", "text": "Here is the coke, you can now pass it to Felix."}","name=""speak"")", "type=""function"")"]"'
</code>
</pre>
</div>
</div>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="columns is-centered">
<div class="column is-8">
<div class="content">
<p>This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template">Academic Project Page Template</a>.
</p>
<p>
This website is licensed under a <a rel="license"
href="http://creativecommons.org/licenses/by-sa/4.0/">Creative
Commons Attribution-ShareAlike 4.0 International License</a>.
</p>
</div>
</div>
</div>
</div>
</footer>
<script>hljs.highlightAll();</script>
</body>
</html>