-
Notifications
You must be signed in to change notification settings - Fork 32
/
udf.h
703 lines (599 loc) · 27.8 KB
/
udf.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef IMPALA_UDF_UDF_H
#define IMPALA_UDF_UDF_H
// THIS FILE IS USED BY THE STANDALONE IMPALA UDF DEVELOPMENT KIT.
// IT MUST BE BUILDABLE WITH C++98 AND WITHOUT ANY INTERNAL IMPALA HEADERS.
#include <assert.h>
#include <boost/cstdint.hpp>
#include <string.h>
// Only use noexcept if the compiler supports C++11 (some system compilers may not
// or may have it disabled by default).
#if __cplusplus >= 201103L
#define NOEXCEPT noexcept
#else
#define NOEXCEPT
#endif
/// This is the only Impala header required to develop UDFs and UDAs. This header
/// contains the types that need to be used and the FunctionContext object. The context
/// object serves as the interface object between the UDF/UDA and the impala process.
namespace impala {
/// Opaque implementation object behind impala_udf::FunctionContext. It is only
/// forward-declared here so that this header does not need any internal Impala headers.
class FunctionContextImpl;
}
namespace impala_udf {
/// All input and output values will be one of the structs below. The struct is a simple
/// object containing a boolean to store if the value is NULL and the value itself. The
/// value is unspecified if the NULL boolean is set.
///
/// Every *Val struct defined later in this header is forward-declared here so that the
/// list is complete and any of them can be named by pointer or reference before its
/// definition.
struct AnyVal;
struct BooleanVal;
struct TinyIntVal;
struct SmallIntVal;
struct IntVal;
struct BigIntVal;
struct FloatVal;
struct DoubleVal;
struct TimestampVal;
struct StringVal;
struct DecimalVal;
/// A FunctionContext is passed to every UDF/UDA and is the interface for the UDF to the
/// rest of the system. It contains APIs to examine the system state, report errors and
/// manage memory.
///
/// The default constructor, copy constructor and assignment operator are private, so
/// only the friend class impala::FunctionContextImpl can create an instance and
/// instances cannot be copied.
class FunctionContext {
public:
/// Versions that version() can report.
enum ImpalaVersion {
v1_2,
v1_3,
};
/// Type tags stored in TypeDesc::type to describe argument, return and intermediate
/// types.
enum Type {
INVALID_TYPE = 0,
TYPE_NULL,
TYPE_BOOLEAN,
TYPE_TINYINT,
TYPE_SMALLINT,
TYPE_INT,
TYPE_BIGINT,
TYPE_FLOAT,
TYPE_DOUBLE,
TYPE_TIMESTAMP,
TYPE_STRING,
// Not used - maps to CHAR(N), which is not supported for UDFs and UDAs.
TYPE_FIXED_BUFFER,
TYPE_DECIMAL,
TYPE_VARCHAR,
// A fixed-size buffer, passed as a StringVal.
TYPE_FIXED_UDA_INTERMEDIATE
};
/// A type tag together with the extra parameters that some types carry.
struct TypeDesc {
Type type;
/// 'precision' and 'scale' are only valid if type == TYPE_DECIMAL.
int precision;
int scale;
/// Only valid if type is one of TYPE_FIXED_BUFFER, TYPE_FIXED_UDA_INTERMEDIATE or
/// TYPE_VARCHAR.
int len;
};
/// Identifier made of two 64-bit halves; this is the type returned by query_id().
struct UniqueId {
int64_t hi;
int64_t lo;
};
/// Selects which piece of function state SetFunctionState()/GetFunctionState() refer
/// to, and is passed to the UDF's prepare and close functions.
enum FunctionStateScope {
/// Indicates that the function state for this FunctionContext's UDF is shared across
/// the plan fragment (a query is divided into multiple plan fragments, each of which
/// is responsible for a part of the query execution). Within the plan fragment, there
/// may be multiple instances of the UDF executing concurrently with multiple
/// FunctionContexts sharing this state, meaning that the state must be
/// thread-safe. The Prepare() function for the UDF may be called with this scope
/// concurrently on a single host if the UDF will be evaluated in multiple plan
/// fragments on that host. In general, read-only state that doesn't need to be
/// recomputed for every UDF call should be fragment-local.
/// TODO: Move FRAGMENT_LOCAL states to query_state for multi-threading.
FRAGMENT_LOCAL,
/// Indicates that the function state is local to the execution thread. This state
/// does not need to be thread-safe. However, this state will be initialized (via the
/// Prepare() function) once for every execution thread, so fragment-local state
/// should be used when possible for better performance. In general, inexpensive
/// shared state that is written to by the UDF (e.g. scratch space) should be
/// thread-local.
THREAD_LOCAL,
};
/// Returns the version of Impala that's currently running.
ImpalaVersion version() const;
/// Returns the user that is running the query. Returns NULL if it is not
/// available.
const char* user() const;
/// Returns the effective user for authorization purposes. If a delegated user is
/// configured, returns that user, otherwise returns the same as user().
const char* effective_user() const;
/// Returns the query_id for the current query.
UniqueId query_id() const;
/// Sets an error for this UDF. If this is called, this will trigger the
/// query to fail.
void SetError(const char* error_msg);
/// Adds a warning that is returned to the user. This can include things like
/// overflow or other recoverable error conditions.
/// Warnings are capped at a maximum number. Returns true if the warning was
/// added and false if it was ignored due to the cap.
bool AddWarning(const char* warning_msg);
/// Returns true if there's been an error set.
bool has_error() const;
/// Returns the current error message. Returns NULL if there is no error.
const char* error_msg() const;
/// Allocates memory. All UDF/UDAs should use this if possible instead of malloc/new.
/// The UDF/UDA is responsible for calling Free() on all buffers returned by Allocate().
/// If Allocate() fails or causes the memory limit to be exceeded, the error will be
/// set in this object causing the query to fail.
/// TODO: 'byte_size' should be 64-bit. See IMPALA-2756.
uint8_t* Allocate(int byte_size) NOEXCEPT;
/// Wrapper around Allocate() to allocate a buffer of the given type "T".
/// The result is simply the pointer returned by Allocate(sizeof(T)) cast to T*: this
/// wrapper does not run T's constructor, and the pointer should be checked before use
/// because Allocate() may fail.
template<typename T>
T* Allocate() {
return reinterpret_cast<T*>(Allocate(sizeof(T)));
}
/// Reallocates 'ptr' to the new byte_size. If the currently underlying allocation
/// is big enough, the original ptr will be returned. If the allocation needs to
/// grow, a new allocation is made that is at least 'byte_size' and the contents
/// of 'ptr' will be copied into it. If the new allocation fails or causes the
/// memory limit to be exceeded, the error will be set in this object.
///
/// This should be used for buffers that constantly get appended to.
/// TODO: 'byte_size' should be 64-bit. See IMPALA-2756.
uint8_t* Reallocate(uint8_t* ptr, int byte_size) NOEXCEPT;
/// Frees a buffer returned from Allocate() or Reallocate()
void Free(uint8_t* buffer) NOEXCEPT;
/// For allocations that cannot use the Allocate() API provided by this
/// object, TrackAllocation()/Free() can be used to just keep count of the
/// byte sizes. For each call to TrackAllocation(), the UDF/UDA must call
/// the corresponding Free().
void TrackAllocation(int64_t byte_size);
void Free(int64_t byte_size);
/// Methods for maintaining state across UDF/UDA function calls. SetFunctionState() can
/// be used to store a pointer that can then be retrieved via GetFunctionState(). If
/// GetFunctionState() is called when no pointer is set, it will return
/// NULL. SetFunctionState() does not take ownership of 'ptr'; it is up to the UDF/UDA
/// to clean up any function state if necessary.
void SetFunctionState(FunctionStateScope scope, void* ptr);
void* GetFunctionState(FunctionStateScope scope) const;
/// Returns the return type information of this function. For UDAs, this is the final
/// return type of the UDA (e.g., the type returned by the finalize function).
const TypeDesc& GetReturnType() const;
/// Returns the intermediate type for UDAs, i.e., the one returned by
/// update and merge functions. Returns INVALID_TYPE for UDFs.
const TypeDesc& GetIntermediateType() const;
/// Returns the number of arguments to this function (not including the FunctionContext*
/// argument or the output of a UDA).
/// For UDAs, returns the number of logical arguments of the aggregate function, not
/// the number of arguments of the C++ function being executed.
int GetNumArgs() const;
/// Returns the type information for the arg_idx-th argument (0-indexed, not including
/// the FunctionContext* argument). Returns NULL if arg_idx is invalid.
/// For UDAs, returns the logical argument types of the aggregate function, not the
/// argument types of the C++ function being executed.
const TypeDesc* GetArgType(int arg_idx) const;
/// Returns true if the arg_idx-th input argument (indexed in the same way as
/// GetArgType()) is a constant (e.g. 5, "string", 1 + 1).
bool IsArgConstant(int arg_idx) const;
/// Returns a pointer to the value of the arg_idx-th input argument (indexed in the
/// same way as GetArgType()). Returns NULL if the argument is not constant. This
/// function can be used to obtain user-specified constants in a UDF's Init() or
/// Close() functions.
AnyVal* GetConstantArg(int arg_idx) const;
/// TODO: Do we need to add arbitrary key/value metadata. This would be plumbed
/// through the query. E.g. "select UDA(col, 'sample=true') from tbl".
/// const char* GetMetadata(const char*) const;
/// TODO: Add mechanism for UDAs to update stats similar to runtime profile counters
/// TODO: Add mechanism to query for table/column stats
/// Returns the underlying opaque implementation object. The UDF/UDA should not
/// use this. This is used internally.
impala::FunctionContextImpl* impl() const { return impl_; }
~FunctionContext();
private:
/// FunctionContextImpl is the only class with access to the private constructor.
friend class impala::FunctionContextImpl;
FunctionContext();
/// Disable copy ctor and assignment operator
FunctionContext(const FunctionContext& other);
FunctionContext& operator=(const FunctionContext& other);
impala::FunctionContextImpl* impl_; // Owned by this object.
};
//----------------------------------------------------------------------------
//------------------------------- UDFs ---------------------------------------
//----------------------------------------------------------------------------
/// The UDF must implement this function prototype. This is not a typedef as the actual
/// UDF's signature varies from UDF to UDF.
/// typedef <*Val> Evaluate(FunctionContext* context, <const Val& arg>);
///
/// The UDF must return one of the *Val structs. The UDF must accept a pointer to a
/// FunctionContext object and then a const reference for each of the input arguments.
/// Examples of valid Udf signatures are:
/// 1) DoubleVal Example1(FunctionContext* context);
/// 2) IntVal Example2(FunctionContext* context, const IntVal& a1, const DoubleVal& a2);
///
/// UDFs can be variadic. The variable arguments must all come at the end and must be
/// the same type. An example signature is:
/// StringVal Concat(FunctionContext* context, const StringVal& separator,
/// int num_var_args, const StringVal* args);
/// In this case args[0] is the first variable argument and args[num_var_args - 1] is
/// the last.
///
/// ------- Memory Management -------
/// ---------------------------------
/// The UDF can assume that memory from input arguments will have the same lifetime as
/// results for the UDF. In other words, the UDF can return memory from input arguments
/// without making copies. For example, a function like substring will not need to
/// allocate and copy the smaller string.
///
/// Any state needed across calls must be stored and accessed via
/// FunctionContext::SetFunctionState() and FunctionContext::GetFunctionState(). The UDF
/// should not maintain any other state across calls since there is no guarantee on how
/// the execution is multithreaded or distributed.
///
/// For StringVal return values, the UDF can use StringVal(FunctionContext*, int)
/// ctor or the function StringVal::CopyFrom(FunctionContext*, const uint8_t*, size_t).
/// The memory consumed by the StringVal will be managed by Impala. Please see the UDA
/// section below for details.
///
/// -------- Execution Model --------
/// ---------------------------------
/// Execution model: For each UDF use occurring in a given query, at least one
/// FunctionContext will be created. For a given FunctionContext, the UDF's functions are
/// never called concurrently and therefore do not need to be thread-safe. State shared
/// across UDF invocations should be initialized and cleaned up using prepare and close
/// functions (described below).
///
/// Note that a single UDF use may produce multiple FunctionContexts for that UDF (this is
/// so the UDF can be executed concurrently in different threads). For example, the query
/// "select * from tbl where my_udf(x) > 0" may produce multiple FunctionContexts for
/// 'my_udf', each of which may concurrently be passed to 'my_udf's prepare, close, and
/// UDF functions.
///
/// --- Prepare / Close Functions ---
/// ---------------------------------
/// The UDF can optionally include a prepare function, specified in the "CREATE FUNCTION"
/// statement using "prepare_fn=<prepare function symbol>". The prepare function is called
/// before any calls to the UDF to evaluate values. This is the appropriate time for the
/// UDF to initialize any shared data structures, validate versions, etc. If there is an
/// error, this function should call FunctionContext::SetError()/
/// FunctionContext::AddWarning().
///
/// The prepare function is called multiple times with different FunctionStateScopes. It
/// will be called once per fragment with 'scope' set to FRAGMENT_LOCAL, and once per
/// execution thread with 'scope' set to THREAD_LOCAL.
///
/// Signature of a prepare function: it receives the FunctionContext it is being
/// prepared for and the scope of the state being initialized, and returns nothing.
typedef void (*UdfPrepare)(FunctionContext* context,
FunctionContext::FunctionStateScope scope);
/// The UDF can also optionally include a close function, specified in the "CREATE
/// FUNCTION" statement using "close_fn=<close function symbol>". The close function is
/// called after all calls to the UDF have completed. This is the appropriate time for the
/// UDF to deallocate any shared data structures that are not needed to maintain the
/// results. If there is an error, this function should call FunctionContext::SetError()/
/// FunctionContext::AddWarning().
///
/// The close function is called multiple times with different FunctionStateScopes. It
/// will be called once per fragment with 'scope' set to FRAGMENT_LOCAL, and once per
/// execution thread with 'scope' set to THREAD_LOCAL.
///
/// Signature of a close function: it receives the FunctionContext being closed and the
/// scope of the state being torn down, and returns nothing.
typedef void (*UdfClose)(FunctionContext* context,
FunctionContext::FunctionStateScope scope);
//----------------------------------------------------------------------------
//------------------------------- UDAs ---------------------------------------
//----------------------------------------------------------------------------
/// The UDA execution is broken up into a few steps. The general calling pattern
/// is one of these:
/// 1) Init(), Update() (repeatedly), Serialize()
/// 2) Init(), Update() (repeatedly), Finalize()
/// 3) Init(), Merge() (repeatedly), Serialize()
/// 4) Init(), Merge() (repeatedly), Finalize()
/// The UDA is registered with three types: the result type, the input type and
/// the intermediate type.
///
/// If the UDA needs a variable-sized buffer, it should use TYPE_STRING and allocate it
/// from the FunctionContext manually.
/// For UDAs that need a complex data structure as the intermediate state, the
/// intermediate type should be string and the UDA can cast the ptr to the structure
/// it is using.
///
/// Memory Management: allocations that are referred to by the intermediate values
/// returned by Init(), Update() and Merge() must be allocated via
/// FunctionContext::Allocate() and freed via FunctionContext::Free(). Both Serialize()
/// and Finalize() are responsible for cleaning up the intermediate value and freeing
/// such allocations. StringVals returned to Impala directly by Serialize(), Finalize()
/// or GetValue() should be backed by temporary results memory allocated via the
/// StringVal(FunctionContext*, int) ctor, StringVal::CopyFrom(FunctionContext*,
/// const uint8_t*, size_t), or StringVal::Resize().
///
/// Note that in the rare case the StringVal ctor or StringVal::CopyFrom() fail to
/// allocate memory, the StringVal object will be marked as a null string.
/// Serialize()/Finalize() should handle allocation failures by checking the is_null
/// field of the StringVal object and carry out appropriate error handling action.
/// Similarly, FunctionContext::Allocate()/Reallocate() may also fail to allocate
/// memory so callers should check the returned values before using them.
///
/// For clarity in documenting the UDA interface, the various types will be typedefed
/// here. The actual execution resolves all the types at runtime and none of these types
/// should actually be used.
///
/// TODO: add an Init() variant that takes the initial input value to avoid initializing
/// then immediately overwriting the value.
/// Placeholder aliases used only to make the UDA signatures below readable. All four
/// are plain AnyVal.
typedef AnyVal InputType;
typedef AnyVal InputType2;
typedef AnyVal ResultType;
typedef AnyVal IntermediateType;
/// UdaInit is called once for each aggregate group before calls to any of the
/// other functions below.
typedef void (*UdaInit)(FunctionContext* context, IntermediateType* result);
/// This is called for each input value. The UDA should update result based on the
/// input value. The update function can take any number of input arguments. Here
/// are some examples (one input argument, then two input arguments):
typedef void (*UdaUpdate)(FunctionContext* context, const InputType& input,
IntermediateType* result);
typedef void (*UdaUpdate2)(FunctionContext* context, const InputType& input,
const InputType2& input2, IntermediateType* result);
/// Merge an intermediate result 'src' into 'dst'.
typedef void (*UdaMerge)(FunctionContext* context, const IntermediateType& src,
IntermediateType* dst);
/// Serialize the intermediate type. The serialized data is then sent across the
/// wire.
/// No additional functions will be called with this FunctionContext object and the
/// UDA should do final cleanup (e.g. Free()) here.
/// Note: the parameter named 'type' carries the intermediate value to serialize.
typedef const IntermediateType (*UdaSerialize)(FunctionContext* context,
const IntermediateType& type);
/// Called once at the end to return the final value for this UDA.
/// No additional functions will be called with this FunctionContext object and the
/// UDA should do final cleanup (e.g. Free()) here.
typedef ResultType (*UdaFinalize)(FunctionContext* context, const IntermediateType& v);
//----------------------------------------------------------------------------
//-------------Implementation of the *Val structs ----------------------------
//----------------------------------------------------------------------------
/// Common base of every *Val struct; it carries only the NULL indicator.
struct AnyVal {
  /// True when the value is NULL. In that case every other field of the concrete
  /// *Val struct holds arbitrary data, and UDF code must *not* assume those fields
  /// have any particular value (e.g. 0 or -1).
  bool is_null;

  /// Creates a value with the given NULL flag; the default is non-NULL.
  AnyVal(bool is_null = false)
    : is_null(is_null) {
  }
};
/// Value struct holding a bool plus the inherited NULL flag.
struct BooleanVal : public AnyVal {
  /// The payload; only meaningful when is_null is false.
  bool val;

  /// Wraps 'val'. The NULL flag is left to AnyVal's default constructor.
  BooleanVal(bool val = false) : val(val) {}

  /// Returns a BooleanVal whose NULL flag is set.
  static BooleanVal null() {
    BooleanVal null_val;
    null_val.is_null = true;
    return null_val;
  }

  /// NULL-aware equality: two NULLs are equal, a NULL never equals a non-NULL value,
  /// and two non-NULL values are equal when their payloads are equal.
  bool operator==(const BooleanVal& other) const {
    if (is_null || other.is_null) return is_null && other.is_null;
    return val == other.val;
  }

  /// Negation of operator==.
  bool operator!=(const BooleanVal& other) const { return !operator==(other); }
};
/// Value struct holding an int8_t plus the inherited NULL flag.
struct TinyIntVal : public AnyVal {
  /// Integer type used to store the payload.
  typedef int8_t underlying_type_t;

  /// The payload; only meaningful when is_null is false.
  underlying_type_t val;

  /// Wraps 'val'. The NULL flag is left to AnyVal's default constructor.
  TinyIntVal(underlying_type_t val = 0) : val(val) {}

  /// Returns a TinyIntVal whose NULL flag is set.
  static TinyIntVal null() {
    TinyIntVal null_val;
    null_val.is_null = true;
    return null_val;
  }

  /// NULL-aware equality: two NULLs are equal, a NULL never equals a non-NULL value,
  /// and two non-NULL values are equal when their payloads are equal.
  bool operator==(const TinyIntVal& other) const {
    if (is_null || other.is_null) return is_null && other.is_null;
    return val == other.val;
  }

  /// Negation of operator==.
  bool operator!=(const TinyIntVal& other) const { return !operator==(other); }
};
/// Value struct holding an int16_t plus the inherited NULL flag.
struct SmallIntVal : public AnyVal {
  /// Integer type used to store the payload.
  typedef int16_t underlying_type_t;

  /// The payload; only meaningful when is_null is false.
  underlying_type_t val;

  /// Wraps 'val'. The NULL flag is left to AnyVal's default constructor.
  SmallIntVal(underlying_type_t val = 0) : val(val) {}

  /// Returns a SmallIntVal whose NULL flag is set.
  static SmallIntVal null() {
    SmallIntVal null_val;
    null_val.is_null = true;
    return null_val;
  }

  /// NULL-aware equality: two NULLs are equal, a NULL never equals a non-NULL value,
  /// and two non-NULL values are equal when their payloads are equal.
  bool operator==(const SmallIntVal& other) const {
    if (is_null || other.is_null) return is_null && other.is_null;
    return val == other.val;
  }

  /// Negation of operator==.
  bool operator!=(const SmallIntVal& other) const { return !operator==(other); }
};
/// Value struct holding an int32_t plus the inherited NULL flag.
struct IntVal : public AnyVal {
  /// Integer type used to store the payload.
  typedef int32_t underlying_type_t;

  /// The payload; only meaningful when is_null is false.
  underlying_type_t val;

  /// Wraps 'val'. The NULL flag is left to AnyVal's default constructor.
  IntVal(underlying_type_t val = 0) : val(val) {}

  /// Returns an IntVal whose NULL flag is set.
  static IntVal null() {
    IntVal null_val;
    null_val.is_null = true;
    return null_val;
  }

  /// NULL-aware equality: two NULLs are equal, a NULL never equals a non-NULL value,
  /// and two non-NULL values are equal when their payloads are equal.
  bool operator==(const IntVal& other) const {
    if (is_null || other.is_null) return is_null && other.is_null;
    return val == other.val;
  }

  /// Negation of operator==.
  bool operator!=(const IntVal& other) const { return !operator==(other); }
};
/// Value struct holding an int64_t plus the inherited NULL flag.
struct BigIntVal : public AnyVal {
  /// Integer type used to store the payload.
  typedef int64_t underlying_type_t;

  /// The payload; only meaningful when is_null is false.
  underlying_type_t val;

  /// Wraps 'val'. The NULL flag is left to AnyVal's default constructor.
  BigIntVal(underlying_type_t val = 0) : val(val) {}

  /// Returns a BigIntVal whose NULL flag is set.
  static BigIntVal null() {
    BigIntVal null_val;
    null_val.is_null = true;
    return null_val;
  }

  /// NULL-aware equality: two NULLs are equal, a NULL never equals a non-NULL value,
  /// and two non-NULL values are equal when their payloads are equal.
  bool operator==(const BigIntVal& other) const {
    if (is_null || other.is_null) return is_null && other.is_null;
    return val == other.val;
  }

  /// Negation of operator==.
  bool operator!=(const BigIntVal& other) const { return !operator==(other); }
};
/// Value struct holding a float plus the inherited NULL flag.
struct FloatVal : public AnyVal {
  /// The payload; only meaningful when is_null is false.
  float val;

  /// Wraps 'val'. The NULL flag is left to AnyVal's default constructor.
  FloatVal(float val = 0) : val(val) { }

  /// Returns a FloatVal whose NULL flag is set.
  static FloatVal null() {
    FloatVal result;
    result.is_null = true;
    return result;
  }

  /// NULL-aware equality, matching the other *Val structs: two NULLs are equal, a NULL
  /// never equals a non-NULL value, and two non-NULL values are equal when their
  /// payloads compare equal with ==.
  ///
  /// The payload of a NULL value is arbitrary, so it must not take part in the
  /// comparison; otherwise two NULLs could compare unequal.
  bool operator==(const FloatVal& other) const {
    if (is_null && other.is_null) return true;
    if (is_null || other.is_null) return false;
    return val == other.val;
  }

  /// Negation of operator==.
  bool operator!=(const FloatVal& other) const { return !(*this == other); }
};
/// Value struct holding a double plus the inherited NULL flag.
struct DoubleVal : public AnyVal {
  /// The payload; only meaningful when is_null is false.
  double val;

  /// Wraps 'val'. The NULL flag is left to AnyVal's default constructor.
  DoubleVal(double val = 0) : val(val) {}

  /// Returns a DoubleVal whose NULL flag is set.
  static DoubleVal null() {
    DoubleVal null_val;
    null_val.is_null = true;
    return null_val;
  }

  /// NULL-aware equality: two NULLs are equal, a NULL never equals a non-NULL value,
  /// and two non-NULL values are equal when their payloads compare equal with ==.
  bool operator==(const DoubleVal& other) const {
    if (is_null || other.is_null) return is_null && other.is_null;
    return val == other.val;
  }

  /// Negation of operator==.
  bool operator!=(const DoubleVal& other) const { return !operator==(other); }
};
/// Timestamp value made of a date part and a time-of-day part, plus the inherited NULL
/// flag. This object has a compatible storage format with boost::ptime.
struct TimestampVal : public AnyVal {
  /// Gregorian date. This has the same binary format as boost::gregorian::date.
  int32_t date;

  /// Nanoseconds in current day.
  int64_t time_of_day;

  /// Builds a timestamp from its two parts; both default to 0. The NULL flag is left
  /// to AnyVal's default constructor.
  TimestampVal(int32_t date = 0, int64_t time_of_day = 0)
    : date(date), time_of_day(time_of_day) {}

  /// Returns a TimestampVal whose NULL flag is set.
  static TimestampVal null() {
    TimestampVal null_val;
    null_val.is_null = true;
    return null_val;
  }

  /// NULL-aware equality: two NULLs are equal, a NULL never equals a non-NULL value,
  /// and two non-NULL values are equal when both the date and the time of day match.
  bool operator==(const TimestampVal& other) const {
    if (is_null || other.is_null) return is_null && other.is_null;
    return time_of_day == other.time_of_day && date == other.date;
  }

  /// Negation of operator==.
  bool operator!=(const TimestampVal& other) const { return !operator==(other); }
};
/// A String value represented as a buffer + length.
/// Note: there is a difference between a NULL string (is_null == true) and an
/// empty string (len == 0).
struct StringVal : public AnyVal {
  // It's important to keep this as unsigned to avoid comparing with a negative number
  // in case of overflow.
  static const unsigned MAX_LENGTH = (1 << 30);

  // The length of the string buffer in bytes.
  int len;

  // Pointer to the start of the string buffer. The buffer is not aligned and is not
  // null-terminated. Functions must not read or write past the end of the buffer.
  // I.e. accessing ptr[i] where i >= len is invalid.
  uint8_t* ptr;

  /// Construct a StringVal from ptr/len. Note: this does not make a copy of ptr
  /// so the buffer must exist as long as this StringVal does.
  /// 'len' must be non-negative, and a NULL 'ptr' is only allowed with 'len' == 0;
  /// both conditions are checked with assert().
  StringVal(uint8_t* ptr = NULL, int len = 0) : len(len), ptr(ptr) {
    assert(len >= 0);
    // Written as a single assert so that no empty 'if' body is left behind when
    // assert() is compiled out with NDEBUG.
    assert(ptr != NULL || len == 0);
  }

  /// Construct a StringVal from NULL-terminated c-string. Note: this does not make a
  /// copy of ptr so the underlying string must exist as long as this StringVal does.
  /// 'ptr' must not be NULL: strlen() is called on it unconditionally.
  StringVal(const char* ptr)
    : len(static_cast<int>(strlen(ptr))),
      ptr(reinterpret_cast<uint8_t*>(const_cast<char*>(ptr))) {}

  /// Creates a StringVal, allocating a new buffer with 'len'. This should
  /// be used to return StringVal objects in UDF/UDAs that need to allocate new
  /// string memory.
  ///
  /// If the memory allocation fails, e.g. because the intermediate value would be too
  /// large, the constructor will construct a NULL string and set an error on the function
  /// context.
  ///
  /// The memory backing this StringVal is managed by the Impala runtime and so doesn't need
  /// to be explicitly freed.
  StringVal(FunctionContext* context, int len) NOEXCEPT;

  /// Resize a string value to 'len'. If 'len' is the same as or smaller than the current
  /// length, truncates the string. Otherwise, increases the string's length, allocating
  /// new memory and copying over the current contents if needed. The content of the new
  /// space is undefined. If a resize fails, the length and contents of the StringVal are
  /// unchanged.
  ///
  /// Resized strings can be returned from UDFs as the result value. Callers do not
  /// otherwise need to be concerned with backing storage, which is managed by the
  /// Impala runtime and freed at some point after the UDF returns.
  ///
  /// Returns true on success, false on failure.
  bool Resize(FunctionContext* context, int len) NOEXCEPT;

  /// Will create a new StringVal with the given dimension and copy the data from the
  /// parameters. In case of an error will return a NULL string and set an error on the
  /// function context.
  ///
  /// Note that the memory for the buffer of the new StringVal is managed by Impala.
  /// Impala will handle freeing it. Callers should not call Free() on the 'ptr' of
  /// the StringVal returned.
  static StringVal CopyFrom(FunctionContext* ctx, const uint8_t* buf, size_t len)
      NOEXCEPT;

  /// Returns a StringVal whose NULL flag is set (with a NULL 'ptr' and 'len' == 0).
  static StringVal null() {
    StringVal sv;
    sv.is_null = true;
    return sv;
  }

  /// NULL-aware, content-based equality: two NULLs are equal, a NULL never equals a
  /// non-NULL string, and two non-NULL strings are equal when they have the same length
  /// and the same bytes.
  bool operator==(const StringVal& other) const {
    if (is_null != other.is_null) return false;
    if (is_null) return true;
    if (len != other.len) return false;
    // Two empty strings are equal whatever their pointers are. Returning here also
    // keeps memcmp() from being called with a NULL pointer, which is undefined even
    // for a length of zero.
    if (len == 0 || ptr == other.ptr) return true;
    return memcmp(ptr, other.ptr, len) == 0;
  }

  /// Negation of operator==.
  bool operator!=(const StringVal& other) const { return !(*this == other); }
};
struct DecimalVal : public impala_udf::AnyVal {
/// Decimal data is stored as an unscaled integer value. For example, the decimal 1.00
/// (precision 3, scale 2) is stored as 100. The byte size necessary to store the
/// decimal depends on the precision, which determines which field of the union should
/// be used to store and manipulate the unscaled value.
///
/// precision between 0-9: val4 (4 bytes)
/// precision between 10-18: val8 (8 bytes)
/// precision between 19-38: val16 (16 bytes)
///
/// While it is always safe to use a larger field than necessary, it may result in worse
/// performance. For example, a UDF that only uses val16 can handle any precision but
/// may be slower than one that uses val4 or val8. This is because the least-significant
/// bits of all three union fields are the same (assuming a little-endian architecture).
union {
int32_t val4;
int64_t val8;
__int128_t val16;
};
DecimalVal() : val16(0) {}
DecimalVal(int32_t v) : val16(v) {}
DecimalVal(int64_t v) : val16(v) {}
DecimalVal(__int128_t v) : val16(v) {}
static DecimalVal null() {
DecimalVal result;
result.is_null = true;
return result;
}
DecimalVal& operator=(const DecimalVal& other) {
// Depending on the compiler, the default assignment operator may require 16-byte
// alignment of 'this' and 'other'. Cast to void* so the compiler doesn't change back
// to an assignment.
memcpy(reinterpret_cast<void*>(this), reinterpret_cast<const void*>(&other),
sizeof(DecimalVal));
return *this;
}
DecimalVal(const DecimalVal& other) {
*this = other;
}
};
/// Alias for a raw pointer to bytes. It is not used anywhere else in this header.
/// NOTE(review): presumably the value type for fixed-size buffers - confirm against
/// the code that consumes it.
typedef uint8_t* BufferVal;
}
#endif