Skip to content

Commit

Permalink
src: replace naive search in Buffer::IndexOf
Browse files Browse the repository at this point in the history
Adds the string search implementation from V8, which uses a naive search
if the pattern length is < 8 or until a specific badness threshold is
reached, and then switches to Boyer-Moore-Horspool.

Added benchmark shows the expected improvements
Added option to use ucs2 encoding with Buffer::IndexOf
  • Loading branch information
skomski committed Sep 21, 2015
1 parent abb2a4b commit b2088ce
Show file tree
Hide file tree
Showing 8 changed files with 4,931 additions and 60 deletions.
38 changes: 38 additions & 0 deletions benchmark/buffers/buffer-indexof.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
var common = require('../common.js');
var fs = require('fs');

// Benchmark definition: `main` is registered together with the parameter
// lists below (needle text, needle encoding, needle type, iteration factor).
// NOTE(review): presumably the harness in ../common.js invokes main() once
// per parameter combination - confirm there.
var bench = common.createBenchmark(main, {
  search: [
    '@',
    'SQ',
    '10x',
    '--l',
    'Alice',
    'Gryphon',
    'Panther',
    'Ou est ma chatte?',
    'found it very',
    'among mad people',
    'neighbouring pool',
    'Soo--oop',
    'aaaaaaaaaaaaaaaaa',
    'venture to go near the house till she had brought herself down to',
    '</i> to the Caterpillar'
  ],
  encoding: ['undefined', 'utf8', 'ucs2', 'binary'],
  type: ['buffer', 'string'],
  iter: [1]
});

// Runs one benchmark configuration: loads the alice.html fixture as the
// haystack and times `conf.iter * 100000` calls to Buffer#indexOf with the
// configured needle and encoding.
function main(conf) {
  var iterations = conf.iter * 100000;
  var haystack = fs.readFileSync(__dirname + '/../fixtures/alice.html');
  var needle = conf.search;

  // The option value 'undefined' is a string placeholder meaning
  // "call indexOf without an explicit encoding".
  var enc = conf.encoding === 'undefined' ? undefined : conf.encoding;

  // For ucs2 the haystack itself is re-encoded so that it matches the needle.
  if (enc === 'ucs2') {
    haystack = new Buffer(haystack.toString(), enc);
  }

  // type === 'buffer' searches for a Buffer needle instead of a string.
  if (conf.type === 'buffer') {
    needle = new Buffer(new Buffer(needle).toString(), enc);
  }

  bench.start();
  var done = 0;
  while (done < iterations) {
    haystack.indexOf(needle, 0, enc);
    done++;
  }
  bench.end(iterations);
}
3,867 changes: 3,867 additions & 0 deletions benchmark/fixtures/alice.html

Large diffs are not rendered by default.

45 changes: 39 additions & 6 deletions lib/buffer.js
Original file line number Diff line number Diff line change
Expand Up @@ -395,20 +395,53 @@ Buffer.prototype.compare = function compare(b) {
return binding.compare(this, b);
};

// Dispatches a string search with an explicit `encoding` to the matching
// native binding.
//
// - utf8 / ucs2 / utf16le / binary (and their dashed aliases): the string is
//   handed to binding.indexOfString together with the encoding.
// - base64 / ascii / hex: the string is first decoded with
//   Buffer(val, encoding) and searched via binding.indexOfBuffer.
// - anything else: the encoding is stringified and lower-cased and the lookup
//   is retried exactly once; if it is still unknown a TypeError is thrown.
function slowIndexOf(buffer, val, byteOffset, encoding) {
  var loweredCase = false;
  for (;;) {
    switch (encoding) {
      case 'utf8':
      case 'utf-8':
      case 'ucs2':
      case 'ucs-2':
      case 'utf16le':
      case 'utf-16le':
      case 'binary':
        return binding.indexOfString(buffer, val, byteOffset, encoding);

      case 'base64':
      case 'ascii':
      case 'hex':
        return binding.indexOfBuffer(
            buffer, Buffer(val, encoding), byteOffset, encoding);

      default:
        // Second pass already used the lower-cased name: give up.
        if (loweredCase) {
          throw new TypeError('Unknown encoding: ' + encoding);
        }

        encoding = ('' + encoding).toLowerCase();
        loweredCase = true;
    }
  }
}

// Returns the index of the first occurrence of `val` in this buffer, as
// reported by the native binding for the given needle type.
//
//   val         string, Buffer or number to search for
//   byteOffset  start position; coerced to a 32-bit integer
//   encoding    optional encoding of `val`; forwarded to the string and
//               Buffer searches
//
// Throws a TypeError when `val` is not a string, number or Buffer.
Buffer.prototype.indexOf = function indexOf(val, byteOffset, encoding) {
  // `>>= 0` converts with ToInt32, which wraps values outside the int32
  // range; clamp first so huge offsets saturate instead of wrapping.
  if (byteOffset > 0x7fffffff)
    byteOffset = 0x7fffffff;
  else if (byteOffset < -0x80000000)
    byteOffset = -0x80000000;
  byteOffset >>= 0;

  if (typeof val === 'string') {
    // No encoding given: go straight to the native string search and skip
    // the encoding dispatch in slowIndexOf().
    if (encoding === undefined) {
      return binding.indexOfString(this, val, byteOffset, encoding);
    }
    return slowIndexOf(this, val, byteOffset, encoding);
  } else if (val instanceof Buffer) {
    return binding.indexOfBuffer(this, val, byteOffset, encoding);
  } else if (typeof val === 'number') {
    return binding.indexOfNumber(this, val, byteOffset);
  }

  throw new TypeError('val must be string, number or Buffer');
};
Expand Down
1 change: 1 addition & 0 deletions node.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@
'src/util.h',
'src/util-inl.h',
'src/util.cc',
'src/string_search.cc',
'deps/http_parser/http_parser.h',
'deps/v8/include/v8.h',
'deps/v8/include/v8-debug.h',
Expand Down
178 changes: 124 additions & 54 deletions src/node_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "env.h"
#include "env-inl.h"
#include "string_bytes.h"
#include "string_search.h"
#include "util.h"
#include "util-inl.h"
#include "v8-profiler.h"
Expand Down Expand Up @@ -854,87 +855,156 @@ void Compare(const FunctionCallbackInfo<Value> &args) {
}


// Naive byte-wise substring search.
//
// Tries every start position of `haystack` at which `needle` could still
// fit; when the first byte matches, the whole needle is compared with
// memcmp(). Returns the index of the first match or -1 when there is none.
// h_length must be >= n_length (enforced with CHECK_GE).
// TODO(trevnorris): Implement Boyer-Moore string search algorithm.
int32_t IndexOf(const char* haystack,
                size_t h_length,
                const char* needle,
                size_t n_length) {
  CHECK_GE(h_length, n_length);

  const size_t candidate_count = h_length - n_length + 1;
  size_t pos = 0;
  while (pos < candidate_count) {
    const bool first_byte_matches = haystack[pos] == needle[0];
    if (first_byte_matches && memcmp(haystack + pos, needle, n_length) == 0)
      return static_cast<int32_t>(pos);
    pos++;
  }
  return -1;
}


void IndexOfString(const FunctionCallbackInfo<Value>& args) {
ASSERT(args[1]->IsString());
ASSERT(args[2]->IsNumber());

enum encoding enc = ParseEncoding(args.GetIsolate(),
args[3],
UTF8);

THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
SPREAD_ARG(args[0], ts_obj);

node::Utf8Value str(args.GetIsolate(), args[1]);
int32_t offset_i32 = args[2]->Int32Value();
uint32_t offset;
Local<String> needle = args[1].As<String>();
const char* haystack = ts_obj_data;
const size_t haystack_length = ts_obj_length;
const size_t needle_length = needle->Utf8Length();


if (needle_length == 0 || haystack_length == 0) {
return args.GetReturnValue().Set(-1);
}

int64_t offset_i64 = args[2]->IntegerValue();
size_t offset = 0;

if (offset_i32 < 0) {
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
if (offset_i64 < 0) {
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0) {
offset = 0;
else
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
} else {
offset = static_cast<size_t>(haystack_length + offset_i64);
}
} else {
offset = static_cast<uint32_t>(offset_i32);
offset = static_cast<size_t>(offset_i64);
}

if (str.length() == 0 ||
ts_obj_length == 0 ||
(offset != 0 && str.length() + offset <= str.length()) ||
str.length() + offset > ts_obj_length)
if (haystack_length < offset || needle_length + offset > haystack_length) {
return args.GetReturnValue().Set(-1);
}

int32_t r =
IndexOf(ts_obj_data + offset, ts_obj_length - offset, *str, str.length());
args.GetReturnValue().Set(r == -1 ? -1 : static_cast<int32_t>(r + offset));
}
size_t result = haystack_length;

if (enc == UCS2) {
String::Value needle_value(needle);
if (*needle_value == nullptr)
return args.GetReturnValue().Set(-1);

if (haystack_length < 2 || needle_value.length() < 1) {
return args.GetReturnValue().Set(-1);
}

result = SearchString(reinterpret_cast<const uint16_t*>(haystack),
haystack_length / 2,
reinterpret_cast<const uint16_t*>(*needle_value),
needle_value.length(),
offset / 2);
result *= 2;
} else if (enc == UTF8) {
String::Utf8Value needle_value(needle);
if (*needle_value == nullptr)
return args.GetReturnValue().Set(-1);

result = SearchString(reinterpret_cast<const uint8_t*>(haystack),
haystack_length,
reinterpret_cast<const uint8_t*>(*needle_value),
needle_length,
offset);
} else if (enc == BINARY) {
uint8_t* needle_data = static_cast<uint8_t*>(malloc(needle_length));
if (needle_data == nullptr) {
return args.GetReturnValue().Set(-1);
}
needle->WriteOneByte(
needle_data, 0, needle_length, String::NO_NULL_TERMINATION);

result = SearchString(reinterpret_cast<const uint8_t*>(haystack),
haystack_length,
needle_data,
needle_length,
offset);
free(needle_data);
}

args.GetReturnValue().Set(
result == haystack_length ? -1 : static_cast<int>(result));
}

// Native backend of Buffer#indexOf for Buffer needles.
//
// Arguments (as used below):
//   args[0]  Buffer to search (the haystack).
//   args[1]  Buffer holding the needle bytes.
//   args[2]  start offset in bytes; must be a number. A negative value is
//            taken relative to the end of the haystack and clamped to 0.
//   args[3]  encoding, parsed with UTF8 as the fallback. UCS2 selects a
//            search in 16-bit units; every other value searches bytes.
//
// Sets the return value to the byte index of the first match, or to -1 when
// nothing matches, either buffer is empty, or the needle cannot fit at the
// requested offset.
void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
  ASSERT(args[1]->IsObject());
  ASSERT(args[2]->IsNumber());

  enum encoding enc = ParseEncoding(args.GetIsolate(),
                                    args[3],
                                    UTF8);

  THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
  SPREAD_ARG(args[0], ts_obj);
  SPREAD_ARG(args[1], buf);

  if (buf_length > 0)
    CHECK_NE(buf_data, nullptr);

  const char* haystack = ts_obj_data;
  const size_t haystack_length = ts_obj_length;
  const char* needle = buf_data;
  const size_t needle_length = buf_length;

  if (needle_length == 0 || haystack_length == 0) {
    return args.GetReturnValue().Set(-1);
  }

  int64_t offset_i64 = args[2]->IntegerValue();
  size_t offset = 0;

  if (offset_i64 < 0) {
    // Negative offsets count back from the end; clamp at the start.
    if (offset_i64 + static_cast<int64_t>(haystack_length) < 0)
      offset = 0;
    else
      offset = static_cast<size_t>(haystack_length + offset_i64);
  } else {
    offset = static_cast<size_t>(offset_i64);
  }

  if (haystack_length < offset || needle_length + offset > haystack_length) {
    return args.GetReturnValue().Set(-1);
  }

  // haystack_length doubles as the "not found" marker for `result`.
  size_t result = haystack_length;

  if (enc == UCS2) {
    if (haystack_length < 2 || needle_length < 2) {
      return args.GetReturnValue().Set(-1);
    }

    // Search in 16-bit units. The miss value is compared in units and only a
    // real hit is converted back to bytes: doubling the miss value would not
    // equal haystack_length when the haystack has an odd number of bytes.
    const size_t haystack_units = haystack_length / 2;
    const size_t unit_index = SearchString(
        reinterpret_cast<const uint16_t*>(haystack),
        haystack_units,
        reinterpret_cast<const uint16_t*>(needle),
        needle_length / 2,
        offset / 2);
    if (unit_index != haystack_units)
      result = unit_index * 2;
  } else {
    result = SearchString(
        reinterpret_cast<const uint8_t*>(haystack),
        haystack_length,
        reinterpret_cast<const uint8_t*>(needle),
        needle_length,
        offset);
  }

  args.GetReturnValue().Set(
      result == haystack_length ? -1 : static_cast<int>(result));
}

// Native backend of Buffer#indexOf for numeric needles: looks for a single
// byte value in the buffer passed as args[0] and sets the return value to its
// index, or to -1 when it is not found.
//
// NOTE(review): this listing looks like a rendered diff. The "Expand All"
// line below is a hunk marker with elided source lines behind it, and the
// int32_t/uint32_t lines appear to be the pre-change versions of the
// int64_t/size_t lines that follow them - confirm against the real
// src/node_buffer.cc before relying on this text.
void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
ASSERT(args[1]->IsNumber());
Expand All @@ -944,25 +1014,25 @@ void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
SPREAD_ARG(args[0], ts_obj);

// args[1] is the value to look for, args[2] the start offset.
uint32_t needle = args[1]->Uint32Value();
int32_t offset_i32 = args[2]->Int32Value();
uint32_t offset;
int64_t offset_i64 = args[2]->IntegerValue();
size_t offset;

// A negative offset counts back from the end of the buffer and is clamped
// to 0 when it reaches past the start.
if (offset_i32 < 0) {
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
if (offset_i64 < 0) {
if (offset_i64 + static_cast<int64_t>(ts_obj_length) < 0)
offset = 0;
else
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
offset = static_cast<size_t>(ts_obj_length + offset_i64);
} else {
offset = static_cast<uint32_t>(offset_i32);
offset = static_cast<size_t>(offset_i64);
}

// Nothing to search: empty buffer, or the offset is at/after the end.
if (ts_obj_length == 0 || offset + 1 > ts_obj_length)
return args.GetReturnValue().Set(-1);

// memchr() compares `needle` converted to unsigned char, so only its low
// byte takes part in the search. The hit is reported relative to the start
// of the buffer, not relative to `offset`.
void* ptr = memchr(ts_obj_data + offset, needle, ts_obj_length - offset);
char* ptr_char = static_cast<char*>(ptr);
args.GetReturnValue().Set(
ptr ? static_cast<int32_t>(ptr_char - ts_obj_data) : -1);
args.GetReturnValue().Set(ptr ? static_cast<int>(ptr_char - ts_obj_data)
: -1);
}


Expand Down
10 changes: 10 additions & 0 deletions src/string_search.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#include "string_search.h"

namespace node {
namespace stringsearch {

// Out-of-class definitions (storage) for the static int arrays of
// StringSearchBase. Two are sized by kBMMaxShift + 1 and one by
// kUC16AlphabetSize.
// NOTE(review): presumably these are the shift/suffix scratch tables used by
// the Boyer-Moore(-Horspool) searchers declared in string_search.h - confirm
// against that header.
int StringSearchBase::kBadCharShiftTable[kUC16AlphabetSize];
int StringSearchBase::kGoodSuffixShiftTable[kBMMaxShift + 1];
int StringSearchBase::kSuffixTable[kBMMaxShift + 1];
} // namespace stringsearch
} // namespace node
Loading

0 comments on commit b2088ce

Please sign in to comment.