Skip to content

Commit

Permalink
Use Utf8ToStringWithFallback in ToString and exception/logging message building
Browse files Browse the repository at this point in the history
  • Loading branch information
paulirwin committed Dec 18, 2024
1 parent a04df3a commit 3f2dbc1
Show file tree
Hide file tree
Showing 10 changed files with 56 additions and 45 deletions.
7 changes: 4 additions & 3 deletions src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ public static void ReadLine(DataInput input, BytesRef scratch)
{
break;
}

scratch.Bytes[upto++] = b;
}
}
Expand All @@ -106,8 +106,9 @@ public static void CheckFooter(ChecksumIndexInput input)

if (StringHelper.StartsWith(scratch, CHECKSUM) == false)
{
// LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
throw new CorruptIndexException("SimpleText failure: expected checksum line but got " +
scratch.Utf8ToString() + " (resource=" + input + ")");
scratch.Utf8ToStringWithFallback() + " (resource=" + input + ")");
}
var actualChecksum =
(new BytesRef(scratch.Bytes, CHECKSUM.Length, scratch.Length - CHECKSUM.Length)).Utf8ToString();
Expand All @@ -124,4 +125,4 @@ public static void CheckFooter(ChecksumIndexInput input)
}
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public class DefaultSortedSetDocValuesReaderState : SortedSetDocValuesReaderStat

/// <summary>
/// Creates this, pulling doc values from the specified
/// field.
/// field.
/// </summary>
public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = FacetsConfig.DEFAULT_INDEX_FIELD_NAME)
{
Expand Down Expand Up @@ -79,7 +79,8 @@ public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = F
string[] components = FacetsConfig.StringToPath(spare.Utf8ToString());
if (components.Length != 2)
{
throw new ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + spare.Utf8ToString());
// LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
throw new ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + spare.Utf8ToStringWithFallback());
}
if (!components[0].Equals(lastDim, StringComparison.Ordinal))
{
Expand All @@ -101,7 +102,7 @@ public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = F
/// <summary>
/// Return top-level doc values.
/// </summary>
public override SortedSetDocValues GetDocValues()
public override SortedSetDocValues GetDocValues()
{
return topReader.GetSortedSetDocValues(field);
}
Expand Down Expand Up @@ -132,4 +133,4 @@ public override OrdRange GetOrdRange(string dim)
/// </summary>
public override int Count => valueCount;
}
}
}
2 changes: 1 addition & 1 deletion src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ public override int GetHashCode()
public override string ToString()
{
return "FacetEntry{" +
"value=" + value.Utf8ToString() +
"value=" + value.Utf8ToStringWithFallback() + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
", count=" + count +
'}';
}
Expand Down
34 changes: 17 additions & 17 deletions src/Lucene.Net.Join/TermsIncludingScoreQuery.cs
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ public WeightAnonymousClass(TermsIncludingScoreQuery outerInstance, Weight origi


private TermsEnum segmentTermsEnum;

public override Explanation Explain(AtomicReaderContext context, int doc)
{
SVInnerScorer scorer = (SVInnerScorer) GetBulkScorer(context, false, null);
Expand All @@ -161,7 +161,7 @@ public override void Normalize(float norm, float topLevelBoost)
{
originalWeight.Normalize(norm, topLevelBoost*outerInstance.Boost);
}

public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)
{
Terms terms = context.AtomicReader.GetTerms(outerInstance._field);
Expand All @@ -181,7 +181,7 @@ public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)

return new SVInOrderScorer(outerInstance, this, acceptDocs, segmentTermsEnum, context.AtomicReader.MaxDoc, cost);
}

public override BulkScorer GetBulkScorer(AtomicReaderContext context, bool scoreDocsInOrder, IBits acceptDocs)
{
if (scoreDocsInOrder)
Expand Down Expand Up @@ -236,7 +236,7 @@ internal SVInnerScorer(TermsIncludingScoreQuery outerInstance, /* Weight weight,
//_cost = cost; // LUCENENET: Never read
_doc = -1;
}

public override bool Score(ICollector collector, int max)
{
FakeScorer fakeScorer = new FakeScorer();
Expand Down Expand Up @@ -285,12 +285,12 @@ private int NextDocOutOfOrder()
}
}
}

/// <summary>
/// Calls <c>NextDoc()</c> on <c>docsEnum</c> and returns its result unchanged.
/// Declared <c>virtual</c> so a derived scorer can supply its own stepping logic.
/// </summary>
protected virtual int DocsEnumNextDoc() => docsEnum.NextDoc();

internal Explanation Explain(int target) // LUCENENET NOTE: changed accessibility from private to internal
{
int docId;
Expand All @@ -314,7 +314,7 @@ internal Explanation Explain(int target) // LUCENENET NOTE: changed accessibilit
} while (docId != DocIdSetIterator.NO_MORE_DOCS);

return new ComplexExplanation(true, outerInstance._scores[outerInstance._ords[_scoreUpto]],
"Score based on join value " + _termsEnum.Term.Utf8ToString());
"Score based on join value " + _termsEnum.Term.Utf8ToStringWithFallback()); // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
}
}

Expand All @@ -326,13 +326,13 @@ internal class MVInnerScorer : SVInnerScorer
internal readonly FixedBitSet alreadyEmittedDocs;

internal MVInnerScorer(TermsIncludingScoreQuery outerInstance, /* Weight weight, // LUCENENET: Never read */
IBits acceptDocs, TermsEnum termsEnum, int maxDoc /*, long cost // LUCENENET: Never read */)
IBits acceptDocs, TermsEnum termsEnum, int maxDoc /*, long cost // LUCENENET: Never read */)
: base(outerInstance, /*weight, // LUCENENET: Never read */
acceptDocs, termsEnum /*, cost // LUCENENET: Never read */)
{
alreadyEmittedDocs = new FixedBitSet(maxDoc);
}

protected override int DocsEnumNextDoc()
{
while (true)
Expand Down Expand Up @@ -360,11 +360,11 @@ internal class SVInOrderScorer : Scorer
internal readonly long cost;

internal int currentDoc = -1;

[SuppressMessage("CodeQuality", "IDE0079:Remove unnecessary suppression", Justification = "This is a SonarCloud issue")]
[SuppressMessage("CodeQuality", "S1699:Constructors should only call non-overridable methods", Justification = "Internal class")]
internal SVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight, IBits acceptDocs,
TermsEnum termsEnum, int maxDoc, long cost)
TermsEnum termsEnum, int maxDoc, long cost)
: base(weight)
{
this.m_outerInstance = outerInstance;
Expand All @@ -374,7 +374,7 @@ internal SVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight,
matchingDocsIterator = matchingDocs.GetIterator();
this.cost = cost;
}

protected virtual void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptDocs,
TermsEnum termsEnum)
{
Expand All @@ -398,12 +398,12 @@ protected virtual void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptD
}
}
}

/// <summary>
/// Returns the entry of <c>scores</c> at index <c>currentDoc</c>.
/// </summary>
public override float GetScore() => scores[currentDoc];

public override int Freq => 1;

public override int DocID => currentDoc;
Expand All @@ -412,7 +412,7 @@ public override int NextDoc()
{
return currentDoc = matchingDocsIterator.NextDoc();
}

public override int Advance(int target)
{
return currentDoc = matchingDocsIterator.Advance(target);
Expand All @@ -432,7 +432,7 @@ internal MVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight,
: base(outerInstance, weight, acceptDocs, termsEnum, maxDoc, cost)
{
}

protected override void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptDocs,
TermsEnum termsEnum)
{
Expand Down Expand Up @@ -465,4 +465,4 @@ protected override void FillDocsAndScores(FixedBitSet matchingDocs, IBits accept
}
}
}
}
}
3 changes: 2 additions & 1 deletion src/Lucene.Net.Misc/Misc/TermStats.cs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ internal string GetTermText()

public override string ToString()
{
return ("TermStats: Term=" + TermText.Utf8ToString() + " DocFreq=" + DocFreq + " TotalTermFreq=" + TotalTermFreq);
// LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
return "TermStats: Term=" + TermText.Utf8ToStringWithFallback() + " DocFreq=" + DocFreq + " TotalTermFreq=" + TotalTermFreq;
}
}
}
2 changes: 1 addition & 1 deletion src/Lucene.Net.Queries/TermsFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ public override string ToString()
}
first = false;
builder.Append(current.field).Append(':');
builder.Append(spare.Utf8ToString());
builder.Append(spare.Utf8ToStringWithFallback()); // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
}
}

Expand Down
3 changes: 2 additions & 1 deletion src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ internal Completion(BytesRef key, int bucket)

public override string ToString()
{
return Utf8.Utf8ToString() + "/" + Bucket.ToString("0.0", CultureInfo.InvariantCulture);
// LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
return Utf8.Utf8ToStringWithFallback() + "/" + Bucket.ToString("0.0", CultureInfo.InvariantCulture);
}

/// <seealso cref="BytesRef.CompareTo(object)"></seealso>
Expand Down
6 changes: 4 additions & 2 deletions src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,8 @@ public PendingTerm(BytesRef term, BlockTermState state)

public override string ToString()
{
return Term.Utf8ToString();
// LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
return Term.Utf8ToStringWithFallback();
}
}

Expand Down Expand Up @@ -468,7 +469,8 @@ public PendingBlock(BytesRef prefix, long fp, bool hasTerms, bool isFloor, int f

public override string ToString()
{
return $"BLOCK: {Prefix.Utf8ToString()}";
// LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
return $"BLOCK: {Prefix.Utf8ToStringWithFallback()}";
}

#nullable enable
Expand Down
23 changes: 14 additions & 9 deletions src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ namespace Lucene.Net.Codecs.Lucene3x
/// <summary>
/// Exposes flex API on a pre-flex index, as a codec.
/// <para/>
/// @lucene.experimental
/// @lucene.experimental
/// </summary>
[Obsolete("(4.0)")]
internal class Lucene3xFields : FieldsProducer
Expand Down Expand Up @@ -344,7 +344,8 @@ private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)

if (DEBUG_SURROGATES)
{
Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
// LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToStringWithFallback()));
}

// Seek "back":
Expand Down Expand Up @@ -488,7 +489,8 @@ private bool DoPop()

if (DEBUG_SURROGATES)
{
Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString());
// LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback()) + " " + scratchTerm.ToString());
}

// TODO: more efficient seek? can we simply swap
Expand Down Expand Up @@ -599,10 +601,11 @@ private void SurrogateDance()

if (DEBUG_SURROGATES)
{
// LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
Console.WriteLine(" dance");
Console.WriteLine(" prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToString()));
Console.WriteLine(" prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToStringWithFallback()));
Console.WriteLine(" " + prevTerm.ToString());
Console.WriteLine(" term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()));
Console.WriteLine(" term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback()));
Console.WriteLine(" " + scratchTerm.ToString());
}

Expand Down Expand Up @@ -679,7 +682,8 @@ private void DoPushes()

if (DEBUG_SURROGATES)
{
Console.WriteLine(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length);
// LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
Console.WriteLine(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length);
}

// Seek "forward":
Expand Down Expand Up @@ -777,7 +781,7 @@ internal virtual void Reset(FieldInfo fieldInfo)
{
//System.out.println("pff.reset te=" + termEnum);
this.fieldInfo = fieldInfo;

internedFieldName = fieldInfo.Name.Intern();

Term term = new Term(internedFieldName);
Expand Down Expand Up @@ -832,7 +836,8 @@ public override SeekStatus SeekCeil(BytesRef term)
{
if (DEBUG_SURROGATES)
{
Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
// LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToStringWithFallback()));
}
skipNext = false;
TermInfosReader tis = outerInstance.TermsDict;
Expand Down Expand Up @@ -1232,4 +1237,4 @@ public override void CheckIntegrity()
{
}
}
}
}
12 changes: 6 additions & 6 deletions src/Lucene.Net/Util/BytesRef.cs
Original file line number Diff line number Diff line change
Expand Up @@ -248,9 +248,9 @@ public string Utf8ToString()
/// resulting <see cref="string"/>.
/// </summary>
/// <remarks>
/// LUCENENET specific version that does not throw exceptions,
/// primarily for use in ToString() and other methods that
/// should not throw exceptions.
/// LUCENENET specific version that does not throw exceptions on invalid UTF-8,
/// primarily for use in ToString() and other cases that should not throw exceptions,
/// such as when building a message for another exception.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public string Utf8ToStringWithFallback()
Expand Down Expand Up @@ -604,11 +604,11 @@ public override string ToString()
switch (format)
{
case BytesRefFormat.UTF8:
try
if (bytesRef.TryUtf8ToString(out var utf8String))
{
return bytesRef.Utf8ToString();
return utf8String;
}
catch (Exception e) when (e.IsIndexOutOfBoundsException())
else
{
return bytesRef.ToString();
}
Expand Down

0 comments on commit 3f2dbc1

Please sign in to comment.