influxdata · jwilder · Nov 20, 2015 · Nov 12, 2015 · Nov 18, 2015 · Nov 19, 2015
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -26,6 +26,7 @@ There are breaking changes in this release:
   - Scripts are now located in `/usr/lib/influxdb/scripts` (previously `/opt/influxdb`)
 
 ### Features
+- [#4841](https://github.com/influxdb/influxdb/pull/4841): Improve parsing of measurements and tags
 - [#4098](https://github.com/influxdb/influxdb/pull/4702): Support 'history' command at CLI
 - [#4098](https://github.com/influxdb/influxdb/issues/4098): Enable `golint` on the code base - uuid subpackage
 - [#4141](https://github.com/influxdb/influxdb/pull/4141): Control whether each query should be logged

diff --git a/models/points.go b/models/points.go
@@ -247,110 +247,18 @@ func scanKey(buf []byte, i int) (int, []byte, error) {
 	// we need to know how many values in the buffer are in use.
 	commas := 0
 
-	// tracks whether we've see an '='
-	equals := 0
-
-	// loop over each byte in buf
-	for {
-		// reached the end of buf?
-		if i >= len(buf) {
-			if equals == 0 && commas > 0 {
-				return i, buf[start:i], fmt.Errorf("missing tag value")
-			}
-
-			break
-		}
-
-		// equals is special in the tags section.  It must be escaped if part of a tag key or value.
-		// It does not need to be escaped if part of the measurement.
-		if buf[i] == '=' && commas > 0 {
-			if i-1 < 0 || i-2 < 0 {
-				return i, buf[start:i], fmt.Errorf("missing tag key")
-			}
-
-			// Check for "cpu,=value" but allow "cpu,a\,=value"
-			if buf[i-1] == ',' && buf[i-2] != '\\' {
-				return i, buf[start:i], fmt.Errorf("missing tag key")
-			}
-
-			// Check for "cpu,\ =value"
-			if buf[i-1] == ' ' && buf[i-2] != '\\' {
-				return i, buf[start:i], fmt.Errorf("missing tag key")
-			}
-
-			i += 1
-			equals += 1
-
-			// Check for "cpu,a=1,b= value=1" or "cpu,a=1,b=,c=foo value=1"
-			if i < len(buf) && (buf[i] == ' ' || buf[i] == ',') {
-				return i, buf[start:i], fmt.Errorf("missing tag value")
-			}
-			continue
-		}
-
-		// escaped character
-		if buf[i] == '\\' {
-			i += 2
-			continue
-		}
-
-		// At a tag separator (comma), track it's location
-		if buf[i] == ',' {
-			if equals == 0 && commas > 0 {
-				return i, buf[start:i], fmt.Errorf("missing tag value")
-			}
-			i += 1
-
-			// grow our indices slice if we have too many tags
-			if commas >= len(indices) {
-				newIndics := make([]int, cap(indices)*2)
-				copy(newIndics, indices)
-				indices = newIndics
-			}
-			indices[commas] = i
-			commas += 1
-
-			// Check for "cpu, value=1"
-			if i < len(buf) && buf[i] == ' ' {
-				return i, buf[start:i], fmt.Errorf("missing tag key")
-			}
-			continue
-		}
-
-		// reached end of the block? (next block would be fields)
-		if buf[i] == ' ' {
-			// check for "cpu,tag value=1"
-			if equals == 0 && commas > 0 {
-				return i, buf[start:i], fmt.Errorf("missing tag value")
-			}
-			if equals > 0 && commas-1 != equals-1 {
-				return i, buf[start:i], fmt.Errorf("missing tag value")
-			}
-
-			// grow our indices slice if we have too many tags
-			if commas >= len(indices) {
-				newIndics := make([]int, cap(indices)*2)
-				copy(newIndics, indices)
-				indices = newIndics
-			}
-
-			indices[commas] = i + 1
-			break
-		}
-
-		i += 1
-	}
-
-	// check that all field sections had key and values (e.g. prevent "a=1,b"
-	// We're using commas -1 because there should always be a comma after measurement
-	if equals > 0 && commas-1 != equals-1 {
-		return i, buf[start:i], fmt.Errorf("invalid tag format")
+	// First scan the Point's measurement.
+	state, i, err := scanMeasurement(buf, i)
+	if err != nil {
+		return i, buf[start:i], err
 	}
 
-	// This check makes sure we actually received fields from the user. #3379
-	// This will catch invalid syntax such as: `cpu,host=serverA,region=us-west`
-	if i >= len(buf) {
-		return i, buf[start:i], fmt.Errorf("missing fields")
+	// Optionally scan tags if needed.
+	if state == tagKeyState {
+		i, commas, indices, err = scanTags(buf, i, indices)
+		if err != nil {
+			return i, buf[start:i], err
+		}
 	}
 
 	// Now we know where the key region is within buf, and the locations of tags, we
@@ -402,6 +310,152 @@ func scanKey(buf []byte, i int) (int, []byte, error) {
 	return i, buf[start:i], nil
 }
 
+// The following constants allow us to specify which state to move to
+// next, when scanning sections of a Point.
+const (
+	tagKeyState = iota
+	tagValueState
+	fieldsState
+)
+
+// scanMeasurement examines the measurement part of a Point, returning
+// the next state to move to, and the current location in the buffer.
+func scanMeasurement(buf []byte, i int) (int, int, error) {
+	// Check first byte of measurement, anything except a comma is fine.
+	// It can't be a space, since whitespace is stripped prior to this
+	// function call.
+	if buf[i] == ',' {
+		return -1, i, fmt.Errorf("missing measurement")
+	}
+
+	for {
+		i++
+		if i >= len(buf) {
+			// cpu
+			return -1, i, fmt.Errorf("missing fields")
+		}
+
+		if buf[i-1] == '\\' {
+			// Skip character (it's escaped).
+			continue
+		}
+
+		// Unescaped comma; move onto scanning the tags.
+		if buf[i] == ',' {
+			return tagKeyState, i + 1, nil
+		}
+
+		// Unescaped space; move onto scanning the fields.
+		if buf[i] == ' ' {
+			// cpu value=1.0
+			return fieldsState, i, nil
+		}
+	}
+}
+
+// scanTags examines all the tags in a Point, keeping track of and
+// returning the updated indices slice, number of commas and location
+// in buf where to start examining the Point fields.
+func scanTags(buf []byte, i int, indices []int) (int, int, []int, error) {
+	var (
+		err    error
+		commas int
+		state  = tagKeyState
+	)
+
+	for {
+		switch state {
+		case tagKeyState:
+			// Grow our indices slice if we have too many tags.
+			if commas >= len(indices) {
+				newIndics := make([]int, cap(indices)*2)
+				copy(newIndics, indices)
+				indices = newIndics
+			}
+			indices[commas] = i
+			commas++
+
+			i, err = scanTagsKey(buf, i)
+			state = tagValueState // tag value always follows a tag key
+		case tagValueState:
+			state, i, err = scanTagsValue(buf, i)
+		case fieldsState:
+			indices[commas] = i + 1
+			return i, commas, indices, nil
+		}
+
+		if err != nil {
+			return i, commas, indices, err
+		}
+	}
+}
+
+// scanTagsKey scans each character in a tag key.
+func scanTagsKey(buf []byte, i int) (int, error) {
+	// First character of the key.
+	if i >= len(buf) || buf[i] == ' ' || buf[i] == ',' || buf[i] == '=' {
+		// cpu,{'', ' ', ',', '='}
+		return i, fmt.Errorf("missing tag key")
+	}
+
+	// Examine each character in the tag key until we hit an unescaped
+	// equals (the tag value), or we hit an error (i.e., unescaped
+	// space or comma).
+	for {
+		i++
+
+		// Either we reached the end of the buffer or we hit an
+		// unescaped comma or space.
+		if i >= len(buf) ||
+			((buf[i] == ' ' || buf[i] == ',') && buf[i-1] != '\\') {
+			// cpu,tag{'', ' ', ','}
+			return i, fmt.Errorf("missing tag value")
+		}
+
+		if buf[i] == '=' && buf[i-1] != '\\' {
+			// cpu,tag=
+			return i + 1, nil
+		}
+	}
+}
+
+// scanTagsValue scans each character in a tag value.
+func scanTagsValue(buf []byte, i int) (int, int, error) {
+	// Tag value cannot be empty.
+	if buf[i] == ',' || buf[i] == ' ' {
+		// cpu,tag={',', ' '}
+		return -1, i, fmt.Errorf("missing tag value")
+	}
+
+	// Examine each character in the tag value until we hit an unescaped
+	// comma (move onto next tag key), an unescaped space (move onto
+	// fields), or we error out.
+	for {
+		i++
+		if i >= len(buf) {
+			// cpu,tag=value
+			return -1, i, fmt.Errorf("missing fields")
+		}
+
+		// An unescaped equals sign is an invalid tag value.
+		if buf[i] == '=' && buf[i-1] != '\\' {
+			// cpu,tag={'=', 'fo=o'}
+			return -1, i, fmt.Errorf("invalid tag format")
+		}
+
+		if buf[i] == ',' && buf[i-1] != '\\' {
+			// cpu,tag=foo,
+			return tagKeyState, i + 1, nil
+		}
+
+		// cpu,tag=foo value=1.0
+		// cpu, tag=foo\= value=1.0
+		if buf[i] == ' ' && buf[i-1] != '\\' {
+			return fieldsState, i, nil
+		}
+	}
+}
+
 func insertionSort(l, r int, buf []byte, indices []int) {
 	for i := l + 1; i < r; i++ {
 		for j := i; j > l && less(buf, indices, j, j-1); j-- {
@@ -804,24 +858,26 @@ func scanTo(buf []byte, i int, stop byte) (int, []byte) {
 // spaces, they are skipped.
 func scanToSpaceOr(buf []byte, i int, stop byte) (int, []byte) {
 	start := i
+	if buf[i] == stop || buf[i] == ' ' {
+		return i, buf[start:i]
+	}
+
 	for {
+		i++
+		if buf[i-1] == '\\' {
+			continue
+		}
+
 		// reached the end of buf?
 		if i >= len(buf) {
-			break
+			return i, buf[start:i]
 		}
 
-		if buf[i] == '\\' {
-			i += 2
-			continue
-		}
 		// reached end of block?
 		if buf[i] == stop || buf[i] == ' ' {
-			break
+			return i, buf[start:i]
 		}
-		i += 1
 	}
-
-	return i, buf[start:i]
 }
 
 func scanTagValue(buf []byte, i int) (int, []byte) {