Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix parsing issues in models.scanKey #4841

Merged
merged 8 commits into from
Nov 20, 2015
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ There are breaking changes in this release:
- Scripts are now located in `/usr/lib/influxdb/scripts` (previously `/opt/influxdb`)

### Features
- [#4841](https://github.com/influxdb/influxdb/pull/4841): Improve parsing of measurements and tags
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we be a little bit more specific here? The user-visible improvement would be less CPU consumption?, is that right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, yes. I wrote that before I made the performance improvements. Not added to the CHANGELOG before, how about:

Improve point parsing speed

Also, I just realised I put it under the features section when it started off as a bug fix. Should I move it or leave it where it is?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have a strong opinion, it's neither one or other.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK I'll leave it where it is. Are you happy with:

Improve point parsing speed

Any better suggestions?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fine with me -- thanks. You can mention you linted the code as well.

- [#4098](https://github.com/influxdb/influxdb/pull/4702): Support 'history' command at CLI
- [#4098](https://github.com/influxdb/influxdb/issues/4098): Enable `golint` on the code base - uuid subpackage
- [#4141](https://github.com/influxdb/influxdb/pull/4141): Control whether each query should be logged
Expand Down
278 changes: 167 additions & 111 deletions models/points.go
Original file line number Diff line number Diff line change
Expand Up @@ -247,110 +247,18 @@ func scanKey(buf []byte, i int) (int, []byte, error) {
// we need to know how many values in the buffer are in use.
commas := 0

// tracks whether we've see an '='
equals := 0

// loop over each byte in buf
for {
// reached the end of buf?
if i >= len(buf) {
if equals == 0 && commas > 0 {
return i, buf[start:i], fmt.Errorf("missing tag value")
}

break
}

// equals is special in the tags section. It must be escaped if part of a tag key or value.
// It does not need to be escaped if part of the measurement.
if buf[i] == '=' && commas > 0 {
if i-1 < 0 || i-2 < 0 {
return i, buf[start:i], fmt.Errorf("missing tag key")
}

// Check for "cpu,=value" but allow "cpu,a\,=value"
if buf[i-1] == ',' && buf[i-2] != '\\' {
return i, buf[start:i], fmt.Errorf("missing tag key")
}

// Check for "cpu,\ =value"
if buf[i-1] == ' ' && buf[i-2] != '\\' {
return i, buf[start:i], fmt.Errorf("missing tag key")
}

i += 1
equals += 1

// Check for "cpu,a=1,b= value=1" or "cpu,a=1,b=,c=foo value=1"
if i < len(buf) && (buf[i] == ' ' || buf[i] == ',') {
return i, buf[start:i], fmt.Errorf("missing tag value")
}
continue
}

// escaped character
if buf[i] == '\\' {
i += 2
continue
}

// At a tag separator (comma), track it's location
if buf[i] == ',' {
if equals == 0 && commas > 0 {
return i, buf[start:i], fmt.Errorf("missing tag value")
}
i += 1

// grow our indices slice if we have too many tags
if commas >= len(indices) {
newIndics := make([]int, cap(indices)*2)
copy(newIndics, indices)
indices = newIndics
}
indices[commas] = i
commas += 1

// Check for "cpu, value=1"
if i < len(buf) && buf[i] == ' ' {
return i, buf[start:i], fmt.Errorf("missing tag key")
}
continue
}

// reached end of the block? (next block would be fields)
if buf[i] == ' ' {
// check for "cpu,tag value=1"
if equals == 0 && commas > 0 {
return i, buf[start:i], fmt.Errorf("missing tag value")
}
if equals > 0 && commas-1 != equals-1 {
return i, buf[start:i], fmt.Errorf("missing tag value")
}

// grow our indices slice if we have too many tags
if commas >= len(indices) {
newIndics := make([]int, cap(indices)*2)
copy(newIndics, indices)
indices = newIndics
}

indices[commas] = i + 1
break
}

i += 1
}

// check that all field sections had key and values (e.g. prevent "a=1,b"
// We're using commas -1 because there should always be a comma after measurement
if equals > 0 && commas-1 != equals-1 {
return i, buf[start:i], fmt.Errorf("invalid tag format")
// First scan the Point's measurement.
state, i, err := scanMeasurement(buf, i)
if err != nil {
return i, buf[start:i], err
}

// This check makes sure we actually received fields from the user. #3379
// This will catch invalid syntax such as: `cpu,host=serverA,region=us-west`
if i >= len(buf) {
return i, buf[start:i], fmt.Errorf("missing fields")
// Optionally scan tags if needed.
if state == tagKeyState {
i, commas, indices, err = scanTags(buf, i, indices)
if err != nil {
return i, buf[start:i], err
}
}

// Now we know where the key region is within buf, and the locations of tags, we
Expand Down Expand Up @@ -402,6 +310,152 @@ func scanKey(buf []byte, i int) (int, []byte, error) {
return i, buf[start:i], nil
}

// The following constants allow us to specify which state to move to
// next, when scanning sections of a Point.
const (
tagKeyState = iota
tagValueState
fieldsState
)

// scanMeasurement examines the measurement part of a Point, returning
// the next state to move to, and the current location in the buffer.
func scanMeasurement(buf []byte, i int) (int, int, error) {
// Check first byte of measurement, anything except a comma is fine.
// It can't be a space, since whitespace is stripped prior to this
// function call.
if buf[i] == ',' {
return -1, i, fmt.Errorf("missing measurement")
}

for {
i++
if i >= len(buf) {
// cpu
return -1, i, fmt.Errorf("missing fields")
}

if buf[i-1] == '\\' {
// Skip character (it's escaped).
continue
}

// Unescaped comma; move onto scanning the tags.
if buf[i] == ',' {
return tagKeyState, i + 1, nil
}

// Unescaped space; move onto scanning the fields.
if buf[i] == ' ' {
// cpu value=1.0
return fieldsState, i, nil
}
}
}

// scanTags examines all the tags in a Point, keeping track of and
// returning the updated indices slice, number of commas and location
// in buf where to start examining the Point fields.
func scanTags(buf []byte, i int, indices []int) (int, int, []int, error) {
var (
err error
commas int
state = tagKeyState
)

for {
switch state {
case tagKeyState:
// Grow our indices slice if we have too many tags.
if commas >= len(indices) {
newIndics := make([]int, cap(indices)*2)
copy(newIndics, indices)
indices = newIndics
}
indices[commas] = i
commas++

i, err = scanTagsKey(buf, i)
state = tagValueState // tag value always follows a tag key
case tagValueState:
state, i, err = scanTagsValue(buf, i)
case fieldsState:
indices[commas] = i + 1
return i, commas, indices, nil
}

if err != nil {
return i, commas, indices, err
}
}
}

// scanTagsKey scans each character in a tag key.
func scanTagsKey(buf []byte, i int) (int, error) {
// First character of the key.
if i >= len(buf) || buf[i] == ' ' || buf[i] == ',' || buf[i] == '=' {
// cpu,{'', ' ', ',', '='}
return i, fmt.Errorf("missing tag key")
}

// Examine each character in the tag key until we hit an unescaped
// equals (the tag value), or we hit an error (i.e., unescaped
// space or comma).
for {
i++

// Either we reached the end of the buffer or we hit an
// unescaped comma or space.
if i >= len(buf) ||
((buf[i] == ' ' || buf[i] == ',') && buf[i-1] != '\\') {
// cpu,tag{'', ' ', ','}
return i, fmt.Errorf("missing tag value")
}

if buf[i] == '=' && buf[i-1] != '\\' {
// cpu,tag=
return i + 1, nil
}
}
}

// scanTagsValue scans each character in a tag value.
func scanTagsValue(buf []byte, i int) (int, int, error) {
// Tag value cannot be empty.
if buf[i] == ',' || buf[i] == ' ' {
// cpu,tag={',', ' '}
return -1, i, fmt.Errorf("missing tag value")
}

// Examine each character in the tag value until we hit an unescaped
// comma (move onto next tag key), an unescaped space (move onto
// fields), or we error out.
for {
i++
if i >= len(buf) {
// cpu,tag=value
return -1, i, fmt.Errorf("missing fields")
}

// An unescaped equals sign is an invalid tag value.
if buf[i] == '=' && buf[i-1] != '\\' {
// cpu,tag={'=', 'fo=o'}
return -1, i, fmt.Errorf("invalid tag format")
}

if buf[i] == ',' && buf[i-1] != '\\' {
// cpu,tag=foo,
return tagKeyState, i + 1, nil
}

// cpu,tag=foo value=1.0
// cpu, tag=foo\= value=1.0
if buf[i] == ' ' && buf[i-1] != '\\' {
return fieldsState, i, nil
}
}
}

func insertionSort(l, r int, buf []byte, indices []int) {
for i := l + 1; i < r; i++ {
for j := i; j > l && less(buf, indices, j, j-1); j-- {
Expand Down Expand Up @@ -804,24 +858,26 @@ func scanTo(buf []byte, i int, stop byte) (int, []byte) {
// spaces, they are skipped.
func scanToSpaceOr(buf []byte, i int, stop byte) (int, []byte) {
start := i
if buf[i] == stop || buf[i] == ' ' {
return i, buf[start:i]
}

for {
i++
if buf[i-1] == '\\' {
continue
}

// reached the end of buf?
if i >= len(buf) {
break
return i, buf[start:i]
}

if buf[i] == '\\' {
i += 2
continue
}
// reached end of block?
if buf[i] == stop || buf[i] == ' ' {
break
return i, buf[start:i]
}
i += 1
}

return i, buf[start:i]
}

func scanTagValue(buf []byte, i int) (int, []byte) {
Expand Down
Loading