Skip to content

Commit

Permalink
Spark 3.1.0 APIs - DataFrame (#888)
Browse files Browse the repository at this point in the history
  • Loading branch information
suhsteve authored Apr 8, 2021
1 parent ea83dac commit 7c67ec9
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 2 deletions.
20 changes: 18 additions & 2 deletions src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,8 @@ public void TestSignaturesV2_3_X()

_df.CreateGlobalTempView("global_view");
_df.CreateOrReplaceGlobalTempView("global_view");

Assert.IsType<string[]>(_df.InputFiles().ToArray());
}

/// <summary>
Expand Down Expand Up @@ -696,10 +698,11 @@ public void TestSignaturesV2_4_X()
}

/// <summary>
/// Test signatures for APIs introduced in Spark 3.*
/// Test signatures for APIs introduced in Spark 3.0.*.

/// </summary>
[SkipIfSparkVersionIsLessThan(Versions.V3_0_0)]
public void TestSignaturesV3_X_X()
public void TestSignaturesV3_0_X()
{
// Validate ToLocalIterator
var data = new List<GenericRow>
Expand Down Expand Up @@ -729,5 +732,18 @@ public void TestSignaturesV3_X_X()
_df.Explain("cost");
_df.Explain("formatted");
}

/// <summary>
/// Exercises the DataFrame APIs that were introduced in Spark 3.1.* and
/// validates the types they return.
/// </summary>
[SkipIfSparkVersionIsLessThan(Versions.V3_1_0)]
public void TestSignaturesV3_1_X()
{
    // UnionByName overload that takes the allowMissingColumns flag.
    var unioned = _df.UnionByName(_df, allowMissingColumns: true);
    Assert.IsType<DataFrame>(unioned);

    // Comparing the DataFrame against itself must yield a bool.
    var sameSemantics = _df.SameSemantics(_df);
    Assert.IsType<bool>(sameSemantics);

    // The semantic hash is surfaced as an int.
    var semanticHash = _df.SemanticHash();
    Assert.IsType<int>(semanticHash);
}
}
}
54 changes: 54 additions & 0 deletions src/csharp/Microsoft.Spark/Sql/DataFrame.cs
Original file line number Diff line number Diff line change
Expand Up @@ -576,6 +576,18 @@ public DataFrame Union(DataFrame other) =>
public DataFrame UnionByName(DataFrame other)
{
    // Forward to the JVM-side "unionByName" and wrap the returned reference.
    var unionRef = _jvmObject.Invoke("unionByName", other);
    return WrapAsDataFrame(unionRef);
}

/// <summary>
/// Unions the rows of this <see cref="DataFrame"/> with the rows of another
/// <see cref="DataFrame"/> and returns the result as a new <see cref="DataFrame"/>.
/// Columns are resolved by name.
/// </summary>
/// <param name="other">The DataFrame whose rows are unioned with this one</param>
/// <param name="allowMissingColumns">
/// Whether missing columns are allowed; forwarded as-is to the JVM "unionByName" call
/// </param>
/// <returns>DataFrame object holding the union</returns>
[Since(Versions.V3_1_0)]
public DataFrame UnionByName(DataFrame other, bool allowMissingColumns)
{
    var unionRef = _jvmObject.Invoke("unionByName", other, allowMissingColumns);
    return WrapAsDataFrame(unionRef);
}

/// <summary>
/// Returns a new `DataFrame` containing rows only in both this `DataFrame`
/// and another `DataFrame`.
Expand Down Expand Up @@ -1019,6 +1031,48 @@ public DataFrameWriter Write() =>
public DataStreamWriter WriteStream()
{
    // Obtain the JVM-side stream writer reference, then wrap it together with this DataFrame.
    var writerRef = (JvmObjectReference)_jvmObject.Invoke("writeStream");
    return new DataStreamWriter(writerRef, this);
}

/// <summary>
/// Gets a best-effort snapshot of the files this <see cref="DataFrame"/> is composed of.
/// Each constituent BaseRelation is asked for its files and the union of all answers is
/// returned with duplicates removed. Depending on the source relations, some input files
/// may not be found.
/// </summary>
/// <returns>Files that compose this DataFrame</returns>
public IEnumerable<string> InputFiles()
{
    // The JVM call result is cast to a string array before being returned.
    var files = (string[])_jvmObject.Invoke("inputFiles");
    return files;
}

/// <summary>
/// Indicates whether the logical query plans of this <see cref="DataFrame"/> and
/// <paramref name="other"/> are equal, in which case both return the same results.
/// </summary>
/// <remarks>
/// The comparison is simplified: cosmetic differences such as attribute names are
/// tolerated.
///
/// The check is very fast, but it can still report `false` for two
/// <see cref="DataFrame"/>s that return the same results, for instance when they are
/// built from different plans. This false-negative behavior can be useful, for example,
/// when caching.
/// </remarks>
/// <param name="other">The DataFrame to compare against</param>
/// <returns>
/// `true` when the logical query plans inside both <see cref="DataFrame"/>s are
/// equal and therefore return the same results.
/// </returns>
[Since(Versions.V3_1_0)]
public bool SameSemantics(DataFrame other)
{
    var plansMatch = (bool)_jvmObject.Invoke("sameSemantics", other);
    return plansMatch;
}

/// <summary>
/// Computes a hash code from the logical query plan of this <see cref="DataFrame"/>.
/// </summary>
/// <remarks>
/// This differs from the standard hash code: the hash is taken over the query plan
/// after it has been simplified to tolerate cosmetic differences such as attribute names.
/// </remarks>
/// <returns>Hash code of the logical query plan</returns>
[Since(Versions.V3_1_0)]
public int SemanticHash()
{
    var planHash = (int)_jvmObject.Invoke("semanticHash");
    return planHash;
}

/// <summary>
/// Returns row objects based on the function (either "toPythonIterator",
/// "collectToPython", or "tailToPython").
Expand Down

0 comments on commit 7c67ec9

Please sign in to comment.