diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs index 77891945b..58403b485 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs @@ -667,6 +667,8 @@ public void TestSignaturesV2_3_X() _df.CreateGlobalTempView("global_view"); _df.CreateOrReplaceGlobalTempView("global_view"); + + Assert.IsType(_df.InputFiles().ToArray()); } /// @@ -696,10 +698,11 @@ public void TestSignaturesV2_4_X() } /// - /// Test signatures for APIs introduced in Spark 3.* + /// Test signatures for APIs introduced in Spark 3.0.*. + /// [SkipIfSparkVersionIsLessThan(Versions.V3_0_0)] - public void TestSignaturesV3_X_X() + public void TestSignaturesV3_0_X() { // Validate ToLocalIterator var data = new List @@ -729,5 +732,18 @@ public void TestSignaturesV3_X_X() _df.Explain("cost"); _df.Explain("formatted"); } + + /// + /// Test signatures for APIs introduced in Spark 3.1.*. + /// + [SkipIfSparkVersionIsLessThan(Versions.V3_1_0)] + public void TestSignaturesV3_1_X() + { + Assert.IsType(_df.UnionByName(_df, true)); + + Assert.IsType(_df.SameSemantics(_df)); + + Assert.IsType(_df.SemanticHash()); + } } } diff --git a/src/csharp/Microsoft.Spark/Sql/DataFrame.cs b/src/csharp/Microsoft.Spark/Sql/DataFrame.cs index e9da9fbdf..8e52c8e42 100644 --- a/src/csharp/Microsoft.Spark/Sql/DataFrame.cs +++ b/src/csharp/Microsoft.Spark/Sql/DataFrame.cs @@ -576,6 +576,18 @@ public DataFrame Union(DataFrame other) => public DataFrame UnionByName(DataFrame other) => WrapAsDataFrame(_jvmObject.Invoke("unionByName", other)); + /// + /// Returns a new containing union of rows in this + /// and another , resolving + /// columns by name.
+ /// + /// Other DataFrame + /// Allow missing columns + /// DataFrame object + [Since(Versions.V3_1_0)] + public DataFrame UnionByName(DataFrame other, bool allowMissingColumns) => + WrapAsDataFrame(_jvmObject.Invoke("unionByName", other, allowMissingColumns)); + + /// /// Returns a new `DataFrame` containing rows only in both this `DataFrame` /// and another `DataFrame`. @@ -1019,6 +1031,48 @@ public DataFrameWriter Write() => public DataStreamWriter WriteStream() => new DataStreamWriter((JvmObjectReference)_jvmObject.Invoke("writeStream"), this); + /// + /// Returns a best-effort snapshot of the files that compose this . + /// This method simply asks each constituent BaseRelation for its respective files and takes + /// the union of all results. Depending on the source relations, this may not find all input + /// files. Duplicates are removed. + /// + /// Files that compose this DataFrame + public IEnumerable InputFiles() => (string[])_jvmObject.Invoke("inputFiles"); + + /// + /// Returns `true` when the logical query plans inside both s are + /// equal and therefore return same results. + /// + /// + /// The equality comparison here is simplified by tolerating the cosmetic differences + /// such as attribute names. + /// + /// This API can compare both s very fast but can still return `false` + /// on the that return the same results, for instance, from different + /// plans. Such false negative semantic can be useful when caching as an example. + /// + /// Other DataFrame + /// + /// `true` when the logical query plans inside both s are + /// equal and therefore return same results. + /// + [Since(Versions.V3_1_0)] + public bool SameSemantics(DataFrame other) => + (bool)_jvmObject.Invoke("sameSemantics", other); + + /// + /// Returns a hash code of the logical query plan against this . + /// + /// + /// Unlike the standard hash code, the hash is calculated against the query plan + /// simplified by tolerating the cosmetic differences such as attribute names.
+ /// + /// Hash code of the logical query plan + [Since(Versions.V3_1_0)] + public int SemanticHash() => + (int)_jvmObject.Invoke("semanticHash"); + /// /// Returns row objects based on the function (either "toPythonIterator", /// "collectToPython", or "tailToPython").