Skip to content

Commit

Permalink
Start work on moving Parser to Utf8
Browse files Browse the repository at this point in the history
  • Loading branch information
ThadHouse committed Jan 24, 2024
1 parent e7ae9e2 commit f83d86f
Show file tree
Hide file tree
Showing 7 changed files with 124 additions and 120 deletions.
96 changes: 33 additions & 63 deletions src/wpiutil/Serialization/Struct/Parsing/Lexer.cs
Original file line number Diff line number Diff line change
@@ -1,24 +1,31 @@
using System;
using System.Buffers;
using System.Text;

namespace WPIUtil.Serialization.Struct.Parsing;

public ref struct Lexer(ReadOnlySpan<char> inStr)
public ref struct Lexer(ReadOnlySpan<byte> inStr)
{
private ReadOnlySpan<char> m_in = inStr;
private char m_current;
private int m_tokenStart;
private int m_pos;
private Utf8CodePointEnumerator m_enumerator = new(inStr);

public int Position => m_enumerator.CurrentMark;

public TokenKind Scan()
{
// skip whitespace
do
{
Get();
} while (m_current == ' ' || m_current == '\t' || m_current == '\n');
m_tokenStart = m_pos - 1;
bool hasMoreData;
do {
hasMoreData = m_enumerator.MoveNext();
} while (hasMoreData && Rune.IsWhiteSpace(m_enumerator.Current));

if (!hasMoreData) {
// String has nothing left
return TokenKind.Unknown;
}

switch (m_current)
m_enumerator.Mark();

switch (m_enumerator.Current.Value)
{
case '[':
return TokenKind.LeftBracket;
Expand Down Expand Up @@ -51,75 +58,38 @@ public TokenKind Scan()
case '\0':
return TokenKind.EndOfInput;
default:
if (char.IsLetter(m_current) || m_current == '_')
if (Rune.IsLetter(m_enumerator.Current) || m_enumerator.Current.Value == '_')
{
return ScanIdentifier();
}
return TokenKind.Unknown;
}
}

public readonly ReadOnlySpan<char> GetTokenText()
public readonly ReadOnlySpan<byte> GetTokenText()
{
if (m_tokenStart >= m_in.Length)
{
return "";
}
return m_in[m_tokenStart..m_pos];
return m_enumerator.MarkedSpan;
}

public readonly int Position => m_tokenStart;

private TokenKind ScanInteger()
{
do
{
Get();
} while (char.IsDigit(m_current));
Unget();
bool hasMoreData;
do {
hasMoreData = m_enumerator.MoveNext();
} while (hasMoreData && Rune.IsDigit(m_enumerator.Current));

m_enumerator.MovePrevious();
return TokenKind.Integer;
}

private TokenKind ScanIdentifier()
{
do
{
Get();
} while (char.IsLetterOrDigit(m_current) || m_current == '_');
Unget();
return TokenKind.Identifier;
}

private void Get()
{
if (m_pos < m_in.Length)
{
m_current = m_in[m_pos];
}
else
{
m_current = '\0';
}
m_pos++;
}
bool hasMoreData;
do {
hasMoreData = m_enumerator.MoveNext();
} while (hasMoreData && (Rune.IsLetterOrDigit(m_enumerator.Current) || m_enumerator.Current.Value == '_'));

private void Unget()
{
if (m_pos > 0)
{
m_pos--;
if (m_pos < m_in.Length)
{
m_current = m_in[m_pos];
}
else
{
m_current = '\0';
}
}
else
{
m_current = '\0';
}
m_enumerator.MovePrevious();
return TokenKind.Identifier;
}
}
27 changes: 14 additions & 13 deletions src/wpiutil/Serialization/Struct/Parsing/Parser.cs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
using System;
using System.Collections.Generic;
using System.Text;

namespace WPIUtil.Serialization.Struct.Parsing;

public ref struct Parser(ReadOnlySpan<char> inStr)
public ref struct Parser(ReadOnlySpan<byte> inStr)
{
private Lexer m_lexer = new(inStr);
private TokenKind m_token;
Expand Down Expand Up @@ -32,7 +33,7 @@ private ParsedDeclaration ParseDeclaration()
ParsedDeclaration decl = new(null!, null!, null, 1, 0);

// optional enum specification
if (m_token == TokenKind.Identifier && "enum".AsSpan().SequenceEqual(m_lexer.GetTokenText()))
if (m_token == TokenKind.Identifier && "enum"u8.SequenceEqual(m_lexer.GetTokenText()))
{
GetNextToken();
Expect(TokenKind.LeftBrace);
Expand All @@ -47,20 +48,20 @@ private ParsedDeclaration ParseDeclaration()

// type name
Expect(TokenKind.Identifier);
decl.TypeString = m_lexer.GetTokenText().ToString();
decl.TypeString = Encoding.UTF8.GetString(m_lexer.GetTokenText());
GetNextToken();

// identifier name
Expect(TokenKind.Identifier);
decl.Name = m_lexer.GetTokenText().ToString();
decl.Name = Encoding.UTF8.GetString(m_lexer.GetTokenText());
GetNextToken();

// array or bit field
if (m_token == TokenKind.LeftBracket)
{
GetNextToken();
Expect(TokenKind.Integer);
ReadOnlySpan<char> valueStr = m_lexer.GetTokenText();
ReadOnlySpan<byte> valueStr = m_lexer.GetTokenText();
if (!int.TryParse(valueStr, out int value))
{
value = 0;
Expand All @@ -72,7 +73,7 @@ private ParsedDeclaration ParseDeclaration()
else
{
throw new ParseException(
m_lexer.Position, $"array size '{valueStr}' is not a positive integer");
m_lexer.Position, $"array size '{Encoding.UTF8.GetString(valueStr)}' is not a positive integer");
}
GetNextToken();
Expect(TokenKind.RightBracket);
Expand All @@ -82,7 +83,7 @@ private ParsedDeclaration ParseDeclaration()
{
GetNextToken();
Expect(TokenKind.Integer);
ReadOnlySpan<char> valueStr = m_lexer.GetTokenText();
ReadOnlySpan<byte> valueStr = m_lexer.GetTokenText();
if (!int.TryParse(valueStr, out int value))
{
value = 0;
Expand All @@ -94,7 +95,7 @@ private ParsedDeclaration ParseDeclaration()
else
{
throw new ParseException(
m_lexer.Position, $"bitfield width '{valueStr}' is not a positive integer");
m_lexer.Position, $"bitfield width '{Encoding.UTF8.GetString(valueStr)}' is not a positive integer");
}
GetNextToken();
}
Expand All @@ -117,17 +118,17 @@ private Dictionary<string, long> ParseEnum()
while (m_token != TokenKind.RightBrace)
{
Expect(TokenKind.Identifier);
ReadOnlySpan<char> name = m_lexer.GetTokenText();
ReadOnlySpan<byte> name = m_lexer.GetTokenText();
GetNextToken();
Expect(TokenKind.Equals);
GetNextToken();
Expect(TokenKind.Integer);
ReadOnlySpan<char> valueStr = m_lexer.GetTokenText();
ReadOnlySpan<byte> valueStr = m_lexer.GetTokenText();
if (!long.TryParse(valueStr, out long value))
{
throw new ParseException(m_lexer.Position, $"could not parse enum value '{valueStr}'");
throw new ParseException(m_lexer.Position, $"could not parse enum value '{Encoding.UTF8.GetString(valueStr)}'");
}
map.Add(name.ToString(), value);
map.Add(Encoding.UTF8.GetString(name), value);
GetNextToken();
if (m_token == TokenKind.RightBrace)
{
Expand All @@ -149,7 +150,7 @@ private void Expect(TokenKind kind)
{
if (m_token != kind)
{
throw new ParseException(m_lexer.Position, $"Expected {kind}, got '{m_lexer.GetTokenText()}'");
throw new ParseException(m_lexer.Position, $"Expected {kind}, got '{Encoding.UTF8.GetString(m_lexer.GetTokenText())}'");
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
using System;
using System.Buffers;
using System.Text;

namespace WPIUtil.Serialization.Struct.Parsing;

/// <summary>
/// Bidirectional cursor over a UTF-8 encoded byte span that decodes one
/// <see cref="Rune"/> (Unicode scalar value) at a time and can remember a
/// "mark" position so that the bytes between the mark and the cursor can be
/// sliced out afterwards.
/// </summary>
/// <param name="str">The UTF-8 bytes to walk over.</param>
public ref struct Utf8CodePointEnumerator(ReadOnlySpan<byte> str) {
    private readonly ReadOnlySpan<byte> m_str = str;

    // Byte offset of the cursor: everything in m_str[..m_index] has been consumed.
    private int m_index;

    /// <summary>
    /// The rune produced by the most recent <see cref="MoveNext"/> or
    /// <see cref="MovePrevious"/> call.
    /// </summary>
    public Rune Current {readonly get; private set;}

    /// <summary>
    /// Byte offset recorded by the most recent <see cref="Mark"/> call
    /// (0 until <see cref="Mark"/> is called).
    /// </summary>
    public int CurrentMark {readonly get; private set;}

    /// <summary>
    /// The bytes from <see cref="CurrentMark"/> (inclusive) up to the cursor (exclusive).
    /// </summary>
    public readonly ReadOnlySpan<byte> MarkedSpan => m_str[CurrentMark..m_index];

    /// <summary>
    /// Records the current cursor position as the mark.
    /// </summary>
    /// <remarks>
    /// The cursor sits after <see cref="Current"/>, so a mark taken right after
    /// <see cref="MoveNext"/> does not include the rune that was just decoded.
    /// </remarks>
    public void Mark() {
        CurrentMark = m_index;
    }

    /// <summary>
    /// Decodes the rune at the cursor, stores it in <see cref="Current"/> and
    /// advances the cursor past the bytes that were consumed.
    /// </summary>
    /// <returns>
    /// <c>false</c> when the decoder reports <see cref="OperationStatus.NeedMoreData"/>
    /// (nothing left, or a truncated trailing sequence); <c>true</c> otherwise.
    /// </returns>
    public bool MoveNext() {
        var status = Rune.DecodeFromUtf8(m_str[m_index..], out var result, out var bytesConsumed);
        m_index += bytesConsumed;
        Current = result;

        return status != OperationStatus.NeedMoreData;
    }

    /// <summary>
    /// Decodes the rune that ends at the cursor, stores it in <see cref="Current"/>
    /// and moves the cursor back to the first byte of that rune.
    /// </summary>
    /// <returns>
    /// <c>false</c> when the decoder reports <see cref="OperationStatus.NeedMoreData"/>
    /// (for example when the cursor is already at the start); <c>true</c> otherwise.
    /// </returns>
    public bool MovePrevious() {
        // Slice the consumed prefix [0, m_index). The previous `..^m_index` slice
        // counted m_index bytes back from the END of the buffer, which decoded an
        // unrelated rune unless the cursor happened to be at the midpoint.
        var status = Rune.DecodeLastFromUtf8(m_str[..m_index], out var result, out var bytesConsumed);
        m_index -= bytesConsumed;
        Current = result;

        return status != OperationStatus.NeedMoreData;
    }
}
1 change: 0 additions & 1 deletion src/wpiutil/Serialization/Struct/StructDescriptor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ namespace WPIUtil.Serialization.Struct;
public sealed class StructDescriptor
{
public string Name { get; }
public string? Schema { get; internal set; }

internal StructDescriptor(string name)
{
Expand Down
4 changes: 2 additions & 2 deletions src/wpiutil/Serialization/Struct/StructDescriptorDatabase.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using WPIUtil.Serialization.Struct.Parsing;
Expand All @@ -6,7 +7,7 @@ namespace WPIUtil.Serialization.Struct;

public sealed class StructDescriptorDatabase
{
public StructDescriptor Add(string name, string schema)
public StructDescriptor Add(string name, ReadOnlySpan<byte> schema)
{
Parser parser = new(schema);
ParsedSchema parsed;
Expand All @@ -27,7 +28,6 @@ public StructDescriptor Add(string name, string schema)
}
// Store non ref in local variable, as we will invalidate the ref later
StructDescriptor theStruct = theStructRef!;
theStruct.Schema = schema;
theStruct.Fields.Clear();
bool isValid = true;
foreach (ref readonly ParsedDeclaration decl in parsed.Declarations)
Expand Down
Loading

0 comments on commit f83d86f

Please sign in to comment.