From a5d7c6936e21453e1541dbaf9bfbff90d62e7ced Mon Sep 17 00:00:00 2001 From: Venki Korukanti Date: Fri, 10 May 2024 14:13:55 -0700 Subject: [PATCH] [Kernel][Defaults] Support reading parquet files with legacy 3-level repeated types (#3083) ## Description When legacy mode is enabled in Spark, array physical types are stored slightly different from the standard format. Standard mode (default): ``` optional group readerFeatures (LIST) { repeated group list { optional binary element (STRING); } } ``` When write legacy mode is enabled (`spark.sql.parquet.writeLegacyFormat = true`): ``` optional group readerFeatures (LIST) { repeated group bag { optional binary array (STRING); } } ``` TODO: We need to handle the 2-level lists. Will post a separate PR. The challenge is with generating or finding the Parquet files with 2-level lists. ## How was this patch tested? Added tests Fixes #3082 --- ...-922e-c1eb96683964-c000.snappy.parquet.crc | Bin 0 -> 172 bytes .../_delta_log/.00000000000000000000.json.crc | Bin 0 -> 48 bytes .../_delta_log/00000000000000000000.json | 4 ++ ...4a15-922e-c1eb96683964-c000.snappy.parquet | Bin 0 -> 20934 bytes ...-8c3c-3d7e56a7bb45-c000.snappy.parquet.crc | Bin 248 -> 0 bytes ...-8ebb-d2b60d7e69c9-c000.snappy.parquet.crc | Bin 0 -> 176 bytes .../_delta_log/.00000000000000000000.json.crc | Bin 48 -> 48 bytes .../_delta_log/00000000000000000000.json | 6 +-- ...fce-8ebb-d2b60d7e69c9-c000.snappy.parquet} | Bin 30312 -> 21057 bytes .../scala/io/delta/golden/GoldenTables.scala | 27 ++++++++++-- .../internal/parquet/ArrayColumnReader.java | 36 ++++++++++++--- .../parquet/ParquetFileReaderSuite.scala | 41 +++++++++--------- .../parquet/ParquetFileWriterSuite.scala | 8 ++-- 13 files changed, 87 insertions(+), 35 deletions(-) create mode 100644 connectors/golden-tables/src/main/resources/golden/parquet-all-types-legacy-format/.part-00000-5afb67f1-094a-4a15-922e-c1eb96683964-c000.snappy.parquet.crc create mode 100644 connectors/golden-tables/src/main/resources/golden/parquet-all-types-legacy-format/_delta_log/.00000000000000000000.json.crc create mode 100644 connectors/golden-tables/src/main/resources/golden/parquet-all-types-legacy-format/_delta_log/00000000000000000000.json create mode 100644 connectors/golden-tables/src/main/resources/golden/parquet-all-types-legacy-format/part-00000-5afb67f1-094a-4a15-922e-c1eb96683964-c000.snappy.parquet delete mode 100644 connectors/golden-tables/src/main/resources/golden/parquet-all-types/.part-00000-981a72ea-9b1a-4bf3-8c3c-3d7e56a7bb45-c000.snappy.parquet.crc create mode 100644 connectors/golden-tables/src/main/resources/golden/parquet-all-types/.part-00000-bf6680d4-5e83-4fce-8ebb-d2b60d7e69c9-c000.snappy.parquet.crc rename connectors/golden-tables/src/main/resources/golden/parquet-all-types/{part-00000-981a72ea-9b1a-4bf3-8c3c-3d7e56a7bb45-c000.snappy.parquet => part-00000-bf6680d4-5e83-4fce-8ebb-d2b60d7e69c9-c000.snappy.parquet} (51%) diff --git a/connectors/golden-tables/src/main/resources/golden/parquet-all-types-legacy-format/.part-00000-5afb67f1-094a-4a15-922e-c1eb96683964-c000.snappy.parquet.crc b/connectors/golden-tables/src/main/resources/golden/parquet-all-types-legacy-format/.part-00000-5afb67f1-094a-4a15-922e-c1eb96683964-c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..31b63e931aaf12fb66810a50f2e363ab9f0caa52 GIT binary patch literal 172 zcmV;d08{^Ca$^7h00IC{Fq3Jz-8^7m5#+}wp-_igb{eKbT#kkNkf`x1JBRob!^JS& z*P$7TVkVuN9LAvPPdD4)H0q7#lkH`b+jvXR!_FPjpjklD!Vv(^t)x#~p9C(MjLz-k z>~SYILz|J3zksi*yH^~O1;XP2$LOkHW}4_2vRXtPP@7n_K@5^5+%uE(^pRb literal 0 HcmV?d00001 diff --git a/connectors/golden-tables/src/main/resources/golden/parquet-all-types-legacy-format/_delta_log/.00000000000000000000.json.crc b/connectors/golden-tables/src/main/resources/golden/parquet-all-types-legacy-format/_delta_log/.00000000000000000000.json.crc new file mode 100644 index 0000000000000000000000000000000000000000..f859677d5587463fa15939d3b86af088206fe724 GIT binary patch literal 48 zcmV-00MGwpa$^7h00IEeYx)BS=62=-J0X=Xj-sm6o_xERDfdzLJky#f-T@Vlz5Tpo-y0a>-H5TlRr721iu(t&fdsplY5fwXj zY%8{nB^qO5)Yzi2CTff(YApZn*-O)CLdx^L|L1w%JipI;_nf(N=FZ%6=1#$12eFfn zzzL^i!bzEMea?hU5=UW{IXsqvl;d(NRpmL3Gk>e+dAWr$jOSsQ@rrd239tZ`zzU!_ zXANvXIUoaaz=u4$Vpah^Y(Yg}2Py%3AOHoZ43xkDQ~_0iBd7+dgBm~uoPaZ^30#0H za09h~JE#rnfV#i~cmgj_4|oF~P#^dLKj04nKm!m6fil3=9V&z(_C(j0R)CSTGLg-@SX6 z3ev!MkPaq*3@{N)0+YcMFcnM#)4>cd6U+v4z+5m7%mnE-+rbX76YK)J!5**|3&DHa-DhJ1w3ev!MkPaq*3@{N)0+YcMFcnM#)4>cd z6U+j$!5lCb%med5CRhL#f<<64SOS)UWnej20ak)lAPcMp*nE-+rbX76YK)J!5**|K2{?nAfV$-h+(0eh4r+rspf2zLo}eD^20oxZ@CAOr9|V8~AQ1fB zx(UHHEx~_XGkp-VFX#utfDVL%2oMRPKs4wN27nka5DWsbAP&TX1ds>@1Db9A_Dq9q zQb8IR57NN|kO3xwNnkRV3Z{YS;J>VuEW}$4vcVdV1J;6dU_JP!weoN4L@7M2B)oh4 zVuJj=I*~c5&?)n`&V0OtlbCVVJZElgg`aXMX8|mM6_5gJU<1kl8IXhWpaQT36@eY7 z1nhwT6reIt0tZkPID%@RI;bI+I>}I-PP_$Goime~Ok9|_GI3*4i-|jv+Dz&&smsKJ zi6;{;CiR$jGx1?kpNSt6es|bD7LzGM`B%lZ8wcFa+s`T zvX04mCb>*DFxkjt6O+wMwlLYoB#+5}8VAWFM3LOb#$P$m9?c z1Czr{3YZ*Wa+Jw2CdYXTn(9sf`p?LOZXJ%wglw5`@ax>u`sGYWI z)UVffR9{>jrbc`9^aW$pJB%6XtQIrSZoYa<(`9Izt?nMQ5$(6DeSP;~n*w!t&y(u= zZkMp#4fQeAJ#70}y|T)0*#51$v5mQi{fKoZ+K5>#%8L#XI}!U7EmIxDF-@z9&)%p+ z>|eY&(p~Hxq@7Y?M@UWejJfMqs->ruj z_Cs$GeHGh>M~cT(1H?wx2Z`vnSi5(!xU$MHvGS#nB91{ctD7n|wiz#uxSx)`WQb4J zPC|dCh|$s0(5D&VRhL=l*BmkQ)?D;$zIbZY0`zZ@s0mvl?y0^4eOxKlyPAc5W{WvX za?saxVzr*R=<-`<4vIgVGvGK1 z#NIQH;#iK0U$ryhcutC~Wv6jWXT-xN&x#Eue}!ZFTHM_7B98Bp=xTWd$M~JN^yoDl z=MB+r{P#H4+u~&L4vzPp$Z-#F%n!w32Y>eH%(Lj3Yx1fwwh7)6*U+`&8}bUG@-XDX`Y_9*I+y~o_Pw*sa2IV17<5V z7+XztVik=htg7bvXO5bx^{Q+3RIj1wT3)5Wcxz_=xa6 zrfEL4mIisC88@)DrdrQBn*3IEHOL9gukN0jxk@jMkF=f!`Jsus>#ec5=%ZP8yuJpx zqPexkPm?;|UsGd3fTm|;pyoyAAWdeoh8on$J*BYAUdZ3G;!9cSTG=5hd_e7u2UM3} zesE)(i(f8~iPln;qbO`{sk=JZRc@>D5c@|*yWP6xBUh^Y#Usz%mtK?v$yKUgaaO@| zX|$l0yQ@U8iOt1kVe6X7y;Uv5A3N?D+p|$id7!G5xOmZ`y`f*UmaA25MeWUz&)VzS z%eATw;wbyg3tV1xkatjZ#L!gVx#~zKd3RN34BgE0)u)Hb!&F@`wA_;=UAxK$sJdb3 z6Xg!7?(#%c52VJ;I+^!-$cL+XB5gWqD{b#3*QxI&H(whsu*!^^$E8EhQ-Kl;CFUng!-@$w$9r9rQoGfkon!)^V8HRE&|E=t#oifQ&%_TYQkyT8Z zlEhDvVGNS^C$jEezE0w2$#6oF`4_U4-U-S4LK((3*@=_goq03aGmJ}dlFGaucTVBg z$uL+cPPVe4?T@E;Cc321cpumlI7Do9gByY#NaP!ackq-nB>LUQzS^m5s;=vJaCPr^1 z3n@bK#WClR5DWo{;5wfKDM`{N-D(sBV?vUBYXk{~hU7=hk0cnO(GcIPz9bkZl9BwE zBp9zTkbQ-ISFhj&e^Lz#2MR{S_^q04-N?$bZTRGCBj``c6ytU*>ND2?bTa87S z9*bE9EDi(iXEk2iIq9+JXuyIVX!$(yeGi|R9eifZ;4@2x&ulesZ~qls<`TZIyB|E} zk$NoV@%G#?-oC?gcqFdyNV*uXpod|}F8J@jWubt}q9t4w!_-);h5sU4mQuJZ1L3k9 zpvPh%VttNvFX6GO4Ubh10~S*_Eb`%h0GG51T+%jhNk^%%*udM{UV+P+f9qi#43BlJ z9*d>CedHMqKix>xYn*MY~r7l*}EH5U8ee+ZXAuaJbca0#RJSZsv!=#DOGSOJHVyX>#^9%+Xvr-%b_A% z4ucI?WbqX`oQGA#OpirgJXKxbsT!)rB3+Ng4)}k7%Ta*Ku?1X? zLkw8t!2b}EfF`I5iZwM1FE{K z5F$MP@>N{vlCE3$nQJ+H=90SbxdIjKczKO+h2gH1GmmW4pH1Cqe{{CWEE zNrJHMFOUWWnYU1I-qu!}52z1(fgkV(0iXd0w6?MiL5mM*3g7$Ghg}LueLssBI70q_ zFMMUe5ADi=KiYi)iGKr&yTvO5(6B5tK)VJwnm;2D>jSMPfRH~W2cZAMNO)yncS*nNHl%;y8%_pjd{QstcsER%Yt1MPQR7D@7q_RFyJycbHtRRW-!c5Sc z3zwhH*Gq$izJdgw-S{W_P-h$u%m7sz-2d4pvdV}>_eT!kzy3_-g_yW3d}}70Hxqt; z^Wc)ybXkzFzdzZ?7f!7Maank>$d0%4MSIa!lBSX#@s^p3w#_@*PtYkd20NZeRY~W# zw3_2SCClSfp116Xce}6rX50!s`%AE`MC@qa0xQq^wX4*kV<_hJs2-U^`dq}8nfMj6 zI4ADPE`)$<#PYjiuKzYhB~3%rEcYqj2hOP3YEER2%qXi=BUcIrM z%l)R-vgTZ)BdxM5@evDa4WCcso>awJefPGz8~aeRySDLDL#$;gZfM5b=5l=J;=OGp zZCcN9rdD$+@HZMzE9VQlJ=;;MZLO}}sf4e_Xw|g*Jipr1>WlNO)0$vU>OR`UMizKH z!JfDuWwj<(!CJ$}FRy-6kD6I$&U0yj80v-P@~@V|7iYw%buO{46SdkouhxhFYSp~Q zwc~B5)mLAb>+SGO8m+?T=AWrWt)6rnT(vRwq#t)C^8!+EF7~7@Kc}6u1J-iOx=Iqg zs9D{r`wlci41NBr8$)gIJsUAvby;w<2DSRup}n0ywTiwb32RNQo_DU}P!V6e(JEbe zGun+>Rl2%hMtmwAI$Ta>~BSSgAgArH-{HwQ9feW>*ch`cAO2wZ_MC zgcx-BK%eT=>ecd;4ZhTBazx#iT547KQir?>_?(VbIm@g#S8C-Sez09btmKYfyk&{% zT7(^STDmw?ftC83VI8yTQmaH)=@T`zdbikfy%oO1BgFK6w|{k{R*o)9n%AdR8}4>>^rPYd3>u!E2Fd5qMFpI+uhrD0;$!V4NLsmQ>!6PhkwHJ%~G^7zcZ{6vPPwL zQiW=eEX$lO*lxXTGmCP?&}9ALtKJmiOx*Cglq>4qx0cJ%rHY}>u6xmxD*C;3%d3zoS!ktw zax{)og}b<8)V^S>G(^=|m4Z}RjU7FE)U7~?mFn@^%iB_>;7ROQAIcQN%AX!6kSW;+ zQ7>;*BxQ>F;Lz@Qe$=X6qY4~lis8yuzrD8jW{Xw>hdi{TOi{lK$_k}S;U;Z4AxEa< zIAz-^QpWb8Owq3i^k45mt^D3R{8glCJd#v#s}!GmsTw;rFutiq&2IcYwhg6F$- zh*E`n>))#rQe_?XH0@Qzl`hn(>LKG#K~!hCevfi+L0C_97W6XvnSiy169;-}C|C4d z)9hwZuBdVYQh3e~+~xK&VXTwzptt1*4QN8N97x?!6s z3f`uq*?uzZ6ji%kqh6PeT-nUK*nTPLcb=-93;SG9QLY#sO+Ot+xuPFq{?rY*vIX&F z#&s!_E8LQ4{R&X+Dno6v38Uyzhauc^ywT@W?CO?NpDHLa_O3d#TvDij{t`q+Btqn4~nI*z?fJtKC^S<%;@1#f)B*D_q-&8|;xQ z+p(j|ZNkGTSM)Kq4qK_%)h{lbeM800O>F&q7m9rcLI@Q$N+?(K*_n>$=bz_LvFopAZhS_?u8x4&EYxau}q{?1gRcdtIGLJHa+gv63dzvB)!PgzVDO2>PLN)hM_4x?V)8SXvlM6vMEr=<`(ddQ0WBsmPRlXf^*k%_XXO&Slk0H_8-4UewkU$`rkLJ2|#8jyxeIe<(Vy}ZUbs(N*~h;ORV<%L^ZYx*Ec8H4+> zJQt+QK|a(r#7$+OV&}eo@Qs`@#Sjtp`Z$XH5LP~Q*=|pnqE5XZ6Hb}J+4h@R6`5kd z04}c?5<{7y_q_Ld7ZtmD|HkbS$`r1R^Q8kQ_QUAnr8_grQKskzYzXOrOz}|vmhdL0 zHCCdyaMO@)F6=QE?iyF`Wq);7c++r|cSLpK{ah?xQS#)p#oMM*Uye63msnU@Nv&ilL*jBWwWG^TxD;=s-b*xsshRVsgri-gvE%(}W>Uwy3)${hL@9XCu&>%3VVWZ&2 zO`57jO|#}LLRxBDwQke4UHi{EbnMhQv`g1+-Fx)x)w@sMeqp-sh{&ku{sUqL4vLM7 zPe>e`l$3uDwZyEv$JVY{WPd0W}Z{jQgJ+qQ4&Zm~s@ z8|y6zYzftd+Z2r9aS^tdH=@lBa-6t#;Q=*x*{!b=uAEwr6L-5ojeP?K&mepDnpa23 zK4PE#JK4XEm)A#m8r^@%901k!3RkTld-|8_zb4zD_LX7ujZe4i*Fd$dH9|&^-EGF% zEo4u5)BO(F`(wX%L>6ivUm4UHYIJJzd@|Xg`C|=aXVjMdM)ux?vprE3#)no_Bca;r z!?G8X-QkYsmt>FcvfBz5SmVy4>LyU_J>QclWLH_;c>~$)?5^G{d;tJtceeCZRSiEXK`_EBZmTH>bL1l3nTdZKuf&8Q91id21Yg z>1YsCyFuGNk?gCxFXoVKTT6G9Y)$5aO1KhghghVvh8owVywQ_=`BuhmvSpoZACcX( zU=h`saj;L#UQliJ%8j$h{-#2Jk!*{A0= zdqwtJvoqefWNLkqy7q?}BW_+_Ms{Y0m|+mCRmCO3_*>7qouJw}CwELDyLV8NLuAifcj7tOhbwpVz@(#nmT@^8YIOTG zViDOrqJBC{_S6eQEpaK+9&E-nh8mybO-?4ergNoSvb)S)dV}mqoLgnwe;e}?w$OdJ z_NQxs>15YvUyx6Brvq(%COh5Z>skufyO;Fs3pL)ixi^n&$B~0ik=_3OD;^I*+SKmp zfl%Z26Y>PI?*uGZL$)$U^&Q!5gj{-=Y#fvB+X|}P`pf=tWPcwXvWslt{MjGLZYlPt zi3cj}$gSV^fEqW}h@D0D^%>8PlU?!c*f(T1kCXc0q02b*yV(Pw+VyR!t|I$Ne)dJO z%h&NLUlq1`(eCC@ZL(AxMfRFuC%2LP?VZl|$hPTvwK`r*Ya1Vn?gBL?_&u3QcGl_< z1!P~aGk;08Rob+A)u6FqE0#iy%g?%hMfR6X@~rW96-M*Tjnq(WK(%8-$sRDR{U)-P z{C@Ek*=Gjos?>nZU4GC3YOLQXWg^*8d)^!%dqJ&<&&WQRX<0 z>uzuSg6w&n11y|iA1^r62&#SQ(<+JVx~tBwBfGC{@9SjG8ux?X4ExBV__k2v^FA-q z$aX)Ix|i&p!R4NiJ$=JGx0LKH?zpjtz&xY@ZY@d=IK>@@XFf+z1ntt z@qgykcFD^*U%baEArQZbv88-X@j|XNV?X(VuH=PWQ@n&LhF->U&3*MUM5VD2`i^O+s58k75;lEz2%c&KPsBn?-b_dZ{8@(%e{H`=H0k= zzl?i#-O=29%sWSO`SW-0UXO$K-Sv0l?98q4f1Xe%_ ztO4#I{_-!}G3%kM@qrOwruj#&?QrG!*MHgG5HT8oV9*#e0cs!u4QK|MgAmXXXhAE` z8nglJKzr~R=m0u`PM|Xg1zkW_&<%74JwQ*;3-kefK^V}1a1a3^K@^Av{lNeb0|tUY zAQr@dc#r@R!C;UAhJc}97#I#lfRSJn7!AgNaX=4JK^hnj(m@882quBaU@Djfrh^$^ zCYS|ggE?R>mCuoNr<%fSk;5@dnZARDX!IbbbV2iAjJun}wmo52>a z6>I}}U_00Wc7k1CH`ojE!9K7b90CS#7!-gb;3zl-j)M~bQ;MUY624w?_nuktHy%rT zxc4*1+F9nN_tTL7-wrRi|HXN6sA%S(l+eG-gHYlLW)2EV{AmEWdDsxsc)*Q6-v5}} zj=7~~Xifh)D5XaB%M{GEa4Y%)S_CuwEg^yxx}`;bEr_j(|7xaWUls`Dt#(jC>BD z3X8P-zivA!q{3U70tf2gsK}5D&x(G(auCiH{XXd?_%m;n{JvA__paL|{)`_=et%Qycg~$cKN8@zr&gAn zs7o5uFF9gBRGdz5$N0ILDW#ebDN&~EzJx7}DtTG5m)@LP8tkdTyOTA{)U+u{}F zTE(YC^^Z!TRrriUA=)Oy_h+V*whW0)(3!U61RK~k%@c-%$C~!R3FQSF!Axlt85I!| zr;By9!FM2gkKVMT9dl@2PLq%j8>Ne90c8}Rb4pSS+k-3D){z>T+?tqpUD62F&U7fv zb%lo_YfJNzV2O?8p)qk$$tk+H#F7r%7W>+V_9VBfecNevjvm+s|K1GS#AA!7$bQKw zNkbx1@L|fdKQo=KaQA3pri&2F*cmj}Md)a&^17rX-H3h((ftyWV&akoGr_WLFa?w- z&BJy5i<(=Qn$ZRoiZ?L*_}CU`#oGUMG~426tlNJPjt!leE}VKIi_;~R?zmZ7Iv41; z^`NK`{f6pdheTnBa30KDIi)##Y-Y#XFk0v;PqM24%JpJO#f5cmQ zM~h86`jH(#UT;UCeBq9YE?jIc8Mcaz72NWA2M1WXvss%aZ~EKTQdyE`ij{kx7wU6v z*9IYd3S*VN(6M;wydninf-&1Gc@BwZ($c|BBA7z)9fCP?Zs%hi1zFJR-b>agO6#K!7yOYPhcH>gn_ z&P8#Z8;%+?#-n)q;yd_~0LgSyU-k&4ZuLuI#IXDSvPUVpwN3LjZY1z^uG?mpfg^=p?sPN~P6l33yFnWyYgN~ZggAV?)V zCY3!#$y8qwBi!^%RrVM~vwTSeB>3OR(c&jDdJHQGl2Y{0R`#A0M=E@#`$({U@hKzA z9`L<~z~X3SDO{bdxH;AL-x?HMMDey|+EqlUeU-5?`;7UEOUPd~D7nTIKXjU&dH>jT z(|Wk0Vx!`s;!SS_aAhm$XhcGMiY_KTxjimv>~i*3o%_hX*~4NPcV2u}|54Zelm4=6 ze)8Y#Fxw1U{c)<$g%<~NF3ZK0M)=54%#_@&EV?|5u0hsr>%%JeNMB>?9W-**{A5f7t&%V>o5&!uuZ2xFl8K zjgrd8%g5KtN97(qBqlae6&dd56X@&X9Te#w-oV!{D#Aa=*E=$>fpYQ literal 0 HcmV?d00001 diff --git a/connectors/golden-tables/src/main/resources/golden/parquet-all-types/.part-00000-981a72ea-9b1a-4bf3-8c3c-3d7e56a7bb45-c000.snappy.parquet.crc b/connectors/golden-tables/src/main/resources/golden/parquet-all-types/.part-00000-981a72ea-9b1a-4bf3-8c3c-3d7e56a7bb45-c000.snappy.parquet.crc deleted file mode 100644 index 8510f65adb28667664c2fb1601c04f87523f79ec..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 248 zcmVy`&&cVB^7QM;i-CEJbLqe0WqIBEiumE2Tv@R|$`jCt1p diff --git a/connectors/golden-tables/src/main/resources/golden/parquet-all-types/.part-00000-bf6680d4-5e83-4fce-8ebb-d2b60d7e69c9-c000.snappy.parquet.crc b/connectors/golden-tables/src/main/resources/golden/parquet-all-types/.part-00000-bf6680d4-5e83-4fce-8ebb-d2b60d7e69c9-c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..48e29b2da0bf6fcb2d38e654928f9c5bb6b5cc25 GIT binary patch literal 176 zcmV;h08js8a$^7h00IC{Fq3Jz-8^7m5#+}wp-_igb{eKbT#kkNkf`x1JBRob!^JS& z*P$7TVkVt}nCSX8GmU&uQXR8*g;H5z(K@yc=@-~~+(B7<@1Q+&HAm{Dh;y`&&cVB^7QM;i-CEJbLqd}h$H!_SiaDS66=2P=pLW(9Cp{4AOT`q`_6`A} eZbY{%@X6AVx+UNl+5&gEC>U%!)a=AR$gB97k5;?@ literal 0 HcmV?d00001 diff --git a/connectors/golden-tables/src/main/resources/golden/parquet-all-types/_delta_log/.00000000000000000000.json.crc b/connectors/golden-tables/src/main/resources/golden/parquet-all-types/_delta_log/.00000000000000000000.json.crc index 5d44c353677503529bed80347360bca781f6be8f..910ab11c13db642541612415b7f9a10de4991e41 100644 GIT binary patch literal 48 zcmV-00MGwpa$^7h00ID)*S<&y=62=-J0X=Xj-sm6o_7rab diff --git a/connectors/golden-tables/src/main/resources/golden/parquet-all-types/_delta_log/00000000000000000000.json b/connectors/golden-tables/src/main/resources/golden/parquet-all-types/_delta_log/00000000000000000000.json index 929e5de0e7f..5d5698d25f6 100644 --- a/connectors/golden-tables/src/main/resources/golden/parquet-all-types/_delta_log/00000000000000000000.json +++ b/connectors/golden-tables/src/main/resources/golden/parquet-all-types/_delta_log/00000000000000000000.json @@ -1,4 +1,4 @@ -{"commitInfo":{"timestamp":1713368423544,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"200","numOutputBytes":"30312"},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.2.0-SNAPSHOT","txnId":"d711414e-08c1-46d0-a5c5-d5faad64d59e"}} -{"metaData":{"id":"7027c1fa-69c4-4867-ace8-6f8c44a022d0","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"ByteType\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ShortType\",\"type\":\"short\",\"nullable\":true,\"metadata\":{}},{\"name\":\"IntegerType\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"LongType\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"FloatType\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"DoubleType\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal\",\"type\":\"decimal(10,2)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"BooleanType\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"StringType\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"BinaryType\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"DateType\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"TimestampType\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"TimestampNTZType\",\"type\":\"timestamp_ntz\",\"nullable\":true,\"metadata\":{}},{\"name\":\"nested_struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"aa\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ac\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"aca\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"array_of_prims\",\"type\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"array_of_arrays\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"array_of_structs\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"ab\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"map_of_prims\",\"type\":{\"type\":\"map\",\"keyType\":\"integer\",\"valueType\":\"long\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"map_of_rows\",\"type\":{\"type\":\"map\",\"keyType\":\"integer\",\"valueType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"ab\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"map_of_arrays\",\"type\":{\"type\":\"map\",\"keyType\":\"long\",\"valueType\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1713368421437}} +{"commitInfo":{"timestamp":1715358308005,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"200","numOutputBytes":"21057"},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.2.0-SNAPSHOT","txnId":"c84f1a78-0895-4f01-b00e-f3a984c8afca"}} +{"metaData":{"id":"ab49cd9e-a908-4aad-a15b-9dd117d3e0ab","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"ByteType\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ShortType\",\"type\":\"short\",\"nullable\":true,\"metadata\":{}},{\"name\":\"IntegerType\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"LongType\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"FloatType\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"DoubleType\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal\",\"type\":\"decimal(10,2)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"BooleanType\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"StringType\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"BinaryType\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"DateType\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"TimestampType\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"TimestampNTZType\",\"type\":\"timestamp_ntz\",\"nullable\":true,\"metadata\":{}},{\"name\":\"nested_struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"aa\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ac\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"aca\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"array_of_prims\",\"type\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"array_of_arrays\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"array_of_structs\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"ab\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"map_of_prims\",\"type\":{\"type\":\"map\",\"keyType\":\"integer\",\"valueType\":\"long\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"map_of_rows\",\"type\":{\"type\":\"map\",\"keyType\":\"integer\",\"valueType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"ab\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"map_of_arrays\",\"type\":{\"type\":\"map\",\"keyType\":\"long\",\"valueType\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1715358307675}} {"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["timestampNtz"],"writerFeatures":["timestampNtz"]}} -{"add":{"path":"part-00000-981a72ea-9b1a-4bf3-8c3c-3d7e56a7bb45-c000.snappy.parquet","partitionValues":{},"size":30312,"modificationTime":1713368423449,"dataChange":true,"stats":"{\"numRecords\":200,\"minValues\":{\"ByteType\":-128,\"ShortType\":1,\"IntegerType\":1,\"LongType\":2,\"FloatType\":0.234,\"DoubleType\":234234.23,\"decimal\":123.52,\"StringType\":\"1\",\"DateType\":\"1970-01-01\",\"TimestampType\":\"1970-01-01T06:30:23.523Z\",\"TimestampNTZType\":\"1970-01-03T17:03:54.000\",\"nested_struct\":{\"aa\":\"1\",\"ac\":{\"aca\":1}}},\"maxValues\":{\"ByteType\":127,\"ShortType\":199,\"IntegerType\":199,\"LongType\":200,\"FloatType\":46.566,\"DoubleType\":4.661261177E7,\"decimal\":24580.48,\"StringType\":\"99\",\"DateType\":\"1970-02-16\",\"TimestampType\":\"1970-02-23T22:48:01.077Z\",\"TimestampNTZType\":\"1971-06-24T11:56:06.000\",\"nested_struct\":{\"aa\":\"99\",\"ac\":{\"aca\":199}}},\"nullCount\":{\"ByteType\":3,\"ShortType\":4,\"IntegerType\":9,\"LongType\":8,\"FloatType\":8,\"DoubleType\":4,\"decimal\":3,\"BooleanType\":3,\"StringType\":4,\"BinaryType\":4,\"DateType\":4,\"TimestampType\":4,\"TimestampNTZType\":3,\"nested_struct\":{\"aa\":14,\"ac\":{\"aca\":22}},\"array_of_prims\":8,\"array_of_arrays\":25,\"array_of_structs\":0,\"map_of_prims\":8,\"map_of_rows\":0,\"map_of_arrays\":7}}"}} +{"add":{"path":"part-00000-bf6680d4-5e83-4fce-8ebb-d2b60d7e69c9-c000.snappy.parquet","partitionValues":{},"size":21057,"modificationTime":1715358307997,"dataChange":true,"stats":"{\"numRecords\":200,\"minValues\":{\"ByteType\":-128,\"ShortType\":1,\"IntegerType\":1,\"LongType\":2,\"FloatType\":0.234,\"DoubleType\":234234.23,\"decimal\":123.52,\"StringType\":\"1\",\"DateType\":\"1970-01-01\",\"TimestampType\":\"1970-01-01T06:30:23.523Z\",\"TimestampNTZType\":\"1970-01-03T17:03:54.000\",\"nested_struct\":{\"aa\":\"1\",\"ac\":{\"aca\":1}}},\"maxValues\":{\"ByteType\":127,\"ShortType\":199,\"IntegerType\":199,\"LongType\":200,\"FloatType\":46.566,\"DoubleType\":4.661261177E7,\"decimal\":24580.48,\"StringType\":\"99\",\"DateType\":\"1970-02-16\",\"TimestampType\":\"1970-02-23T22:48:01.077Z\",\"TimestampNTZType\":\"1971-06-24T11:56:06.000\",\"nested_struct\":{\"aa\":\"99\",\"ac\":{\"aca\":199}}},\"nullCount\":{\"ByteType\":3,\"ShortType\":4,\"IntegerType\":9,\"LongType\":8,\"FloatType\":8,\"DoubleType\":4,\"decimal\":3,\"BooleanType\":3,\"StringType\":4,\"BinaryType\":4,\"DateType\":4,\"TimestampType\":4,\"TimestampNTZType\":3,\"nested_struct\":{\"aa\":14,\"ac\":{\"aca\":22}},\"array_of_prims\":200,\"array_of_arrays\":200,\"array_of_structs\":200,\"map_of_prims\":200,\"map_of_rows\":200,\"map_of_arrays\":200}}"}} diff --git a/connectors/golden-tables/src/main/resources/golden/parquet-all-types/part-00000-981a72ea-9b1a-4bf3-8c3c-3d7e56a7bb45-c000.snappy.parquet b/connectors/golden-tables/src/main/resources/golden/parquet-all-types/part-00000-bf6680d4-5e83-4fce-8ebb-d2b60d7e69c9-c000.snappy.parquet similarity index 51% rename from connectors/golden-tables/src/main/resources/golden/parquet-all-types/part-00000-981a72ea-9b1a-4bf3-8c3c-3d7e56a7bb45-c000.snappy.parquet rename to connectors/golden-tables/src/main/resources/golden/parquet-all-types/part-00000-bf6680d4-5e83-4fce-8ebb-d2b60d7e69c9-c000.snappy.parquet index 275a7aa46ef043ca73c3927919e736e7c2c4cf73..4e5e07fcd60482359d0a203abc774a8215f715d8 100644 GIT binary patch delta 1507 zcmc(eO=uHQ5XawUONdpA72Zp>EZ9b~qS9KcHCl>j5xl6Rdefp{^JP-Ss}zZpAO$@v zY{62cNGTFQAtq2n1vRHwMZ8#ocnI;Mt5%I4hoBed&D$k)JzLqp&dzWCnU6QmT#z13 zOW9jHQiQ6smA<$#Z@N^f%al+zbraIGO(ulsGO=FZ|0i*&|CQLv8(*r9^JfM^Qc6;M z1b)<|Fqo~aM+5U*t=r}z`_8+%e{{ZX zm$;pOd3=rRA|B({iQTO7MWT#lOo?_7?ZQeGrqKC-A*foL|;#U=sy*nL3jh<0h%MysP;-Wfz? zf$mH;+m2O}UqCL$fXGGJp;1&Hg#*TPLzHvnk6wq(( zF)&fYaBQddDzYjtC}LtxWP<5Fi5+A!$i_ZlCOCIqMSvKa$;9PsH3KmVQQ82OAjVA-r3 zZHbnkoQ#~MF1RgNA26qO9JNzI3(Ls|1x>Xj*4p)f(<#cLs31hJ#l}=(!+Jgdu${3* zjID@R*m7fevZI~Hg6)hgi$-9JVOVertYY+}-!2B6PEk#CAVhFoj8#Lyt@eTZ9j^bt e1Ho`szRYK%dDt~m%8v~_UFHc;LPC2EZTSsf;z_aq delta 7592 zcmciH30xFco(J%%z8ktZgenf38(QR&L%Bq`54i*ZMdUsd5ESv&NTULR0s@7oc%cC! zEh2aTDu)_Hqpss-{mjOgIFn3q)4dZ6(?a=A<5IBq}P z$Yl$Uaem%1PAbyl%tV>0+Z07H6f-Dgo{O2A|MuvhmV)_F!Uz}-$mn&CIE9R6=v=y#92dDV-n;7wX7G67!b^ zF1nOZ_F@Z7Niry!=JEJ^fj}q}iNs=wL@L$N($<#AMyU0ppref??E3=9kn8HQzz zjEqf8O{Y&+n3IXBBP?CV`5@s z7cPv8i;rKlC?O#+F)3;BlH}y2OH)#oElW*JOG{7B$jHph%F52p$<51KzI?@s{QQD~ z!orm+i;9YiSFKvTx};>yn$ps>>(;N|uwmmy)uv6GH*eXpb?ZCZ%F0x#^78H5ckI}? zv!bH1vZ`v=uIlQVn%%qi?5VA-tE;cyySJgSsfpv7oAhYlS+eB{W{ zqsNZ5wH-fx;>5e}wzqe5oILs7d#6rycB<8D2~+`04TFI>2Iv9GVc ze_-I!rOTJEsICkS4h;+ipRoIe{IM#xB{ zn_sxl*hoS~1{8of(l(Ha?d4Dcc1Sxw4z`P7BRC=L28GyO2W8-a^ct+ULM8Yj?GJ$v z1}fy$kOc=Z$f&})0a{@p(zRG8VqJ&zemD+`kzNMP*gguUAPwnkIE3vEI19N*7r+T@ zcR?SlM7k5}^DqRZNUJttu^GysADP$SE>t2tj5Uc5AHpuAZ(v=E^-ZiFK|Qt+JMN2^ zdJ%K~$=BUd1@mX&MC^EvA5Cs!MuQb)Cka>}qU=m~aZOfcV()+|s@W1rsySDMk}5N? zC55sbNK>9}6s@Dkl>|t!Gc9++Npha&<@v&Vo&+8e zb-cWNRKBxj&Gnl%-`~R{ATTI6#K$KzEIfjQnwa3=*o6@haq){1qN5W@m_d|TmYSBH zkdTp?m7TL>Np9Zq75Qmt1%)e%idU^JS+inA>Dr>Ab?Y~5EGgNvc?&|ywlbBld^>qY z?F!~A1E!(9;gC?lw26?TiSJELI))y2NO|h(c$WKVFNtAj{NMQ${zdYUFQ+M5E}?k9 zmrJ;Rmo8PAVtYC$zzod60xZD_tic9s!48x#1MFcYIDjKKfit+kEN}%kaF1Qv5w@ zt8eGrybb8nT%h+XA!*2coUh12cokbCT)N0eMgC;+=aUBg0pJyQAZ7-|%$1v;cZd{B zg=nJTKII#ez?zeoB&4ZDLXlu!q);sAaKNHdEi@CT)wG#pA6e#i@MdFA{wZ}jkDsqj zj$BtqPgh@Wn!dp_Lj%T;WsF#3BNJm&lj)|KuHDkg!rIEl+SbO-R%tgw zX+LA8y~9jL2Pa2oCl}{gF0QlOT;1I~+-G~tne92pi}dY2zAA5Y?(^pR`OgpV4-5zj z3=Rqj4h;zl4G)V5Ul0+wASyCCDkeHMW?^jH!g$i?642%nmn=?RqG@(1X{qUH8R?lB zS(({cIoY{6dAZB;RxHn7QIKC)u(Gg-G{GWt?rTa*)|ReYyMEn<^&2;$5pLSL<(;kD z-YMIrQk9jfwwLeNzH>*#&dQ3a%3W2vs%v)bu0iKsS6{ogzF}`;LsKKy)XeQe?|z`= z;DOeIhguIGI&%2vkz+^Ojva41as0%)?I$|gPj;Yl?^K^Ut?oK~rmOqR+3udRy*=l8 z(YasjyU^b^(0^&*@}(=62d@kb4h@eCT^+f0^?LF(bnQ1sZ)o0pJn07}%nmto!yNDgFYpE*xkN?zVq-4&!917`{ty5`5DXy@2H_A05wHLvAqqkv8e$+8 z7D61v!y-t4L`Z_gumqA}saz$YQn0ZMQXw5OAQQ468`2;b@?bfvfP5%`94Lg9Pz1%W z3RXi2tbtNk3+rG#Y=Dih3AU)P*b48!HYfv2z-YG60)fm*18 zde{pM&g*WrB_f*WuXM&TCRh7aHl+=Y8^ zUoP?c4MH3FJs4Y&h>38i7Bl+}?0t>Tq&E>$_2i!pnSu@;?b*P8^*Wu;-}E|_&mUB! z@`bNns$NH9HS$^&%@@CV{qnWywI40TH_*uI*D60g|CPTaO*8Jmu!D>`WO7=EuPf9O z>r1C;8>r-lI*cx>XQXdD&BVaea5|%4&5X>AEley;t*p&$ENm_9tS}0(p6TFd<7De> z=c1ft?>f`X(cQ_zdA7@(S)Q(5Zr<)b9=@~Z&hhh{=RM!wH^47&9tI@yL;b@7Fd_+z z42lYg4vh(m4PUq*E;2r9QFKB~Vr;5H76}M z9Rrli{H%iP!km@4MR~<5R^_iQC@EaCvb1Pz@w!#(OE#?8Sh{KRx-IK5R@qp#Nwv9r z%l551w(YE_EUQxODzDyNvt#$pJ(aapb-U`T_trG*Zmeypf8bzKD|cw$ z;g%!&RYwmVYi&D(_;|SeNXOBW$KGo@b-eRkb^Ga#u9Ig@b$6as_nhwSI)?z+-FLRX zXQ21e`O6ot^bPh84Gdozxq9vL^(*fW-Wa+$JUVjg+U@Hf+_-b|?&v*DC;5qt!N13U z`tEbFf{YKwM^4=$V=_8uK*SU)*&@}47aKpT1GGVB&1mrDHf$cGz# zAC0fkMU)0 zyI~L1!aoZBB*stf092$0nCJi|2Y(Xe-_`;An&3}@{M$N!DT6-=@^9+^CIx@LiLU@M zBql$!?a#$byO^nZSb0Ksa)AHt=lD+35$Z)rNl6(0MkXb7sMTtfLsF-jrX$U$R2~`r zW5gTF6LzW{)P7{(YmU(_Ewt@adyWl^6KVZOYYM|)?4s5TgMZBom$Av%3>W#l9G~G* zy>*Jq)y>V_-DC0$myfS6M!{3faFO|~$f&7jxE3!Vb6js`xW?wUa&pIJxKwYO;##|I zY?5nYhKqcU{^L2WhQ@z>j!QGcrP54so#}ov$@R-Au8A40$@5#IW5ZyIxh-X0iJ7|J ze|8_wyG=Y1-Mb|IEjP=~U@NW47W1O7aV$+AC+`jN^`2&u+RF|`~{)^-dw5`BOK`!J$J`_L^6hjG+FX44Sp1?b>4a#9VR6r$ELk-lDQ`BLj8TP>eI0%Q~ z2(-a*Xon8yf-}$qy)XcmU=W7j2HeCPnSpN{t>T#&qrbyMrm^5QRt(RCMTc*qKPg`` z2R12Rg8ei<6JyCle3|OF53m!-SBzCBA^kr2;jjl|TND$Fa{FJNZsIbqBe{ynXZyM5 z#!*jsC2jopF|+goIrrTPjl`H5%+xVl~tqP(oDOAOwgJr*zR&3b6qr^cwWV0 zS`Eh&<)?V~KZ5bZ(IFo5WFqnE29IfwkC)Vb$YXBv#uFcZ#$!I3*zMdOc}$Pwyxu$u zJ`dj#ZUF)8mp}0^Z~yssJjR7mR>>8(BXdx+Dgm#o}p+yrF`zIU?iHNw!k3vx~53|pPCA9hbtH@P0^2m3GBz3 zqK{C7qt2yHRG1ay$9}IVYI&h4YA9APl;)`OL0;_cSLEU!qR5(3R;*GmGE&qO?9R5K z#F+g7CAO5ZcI$Wv7ZL1kG@vPJ{fnmP@*aBZ?-Fp0!7)Y;P-e;wqYQVWy$fX`)F|No z66~x0H7TQ%4L!&m`*Q)cV`;Gf^BJ z8p{r7%I>2KXTtL*S79gM@u;}V3S&vi#0ld> z0mCC*Cs|SKhnkvSp=MV~iI-`-=KfIEg?|>Z!@6vhF3DDY{Z~>mX+4u_ev_P{=4YCk zyZ)-F8E+=1ALg9WCSpHQu&stlWca;lac>$1x#FK2;+t%|>bPXI0$g&{-^oq<1QiER zO1$Ocmpl??x{SXvZ!^PVD@5#bL-s2}QmWj2h=(gD?RWgj750--Q(SnPrvAe}YHl_f z4tapM*&|^#>8SXb68`i*}M39P;S6uDEJ=(6TScW@%S4QP~{EPQ4F z8fJr>r2KyIAgNzI$2WK5p{A1RziF-(jhRagcS&gzu(x?iq;QWX;>??yiEv{-MfT)- z{AW$AeW(@p7*S#D9*9i5Ia!3U>dkGwZpA(^Bs(Y{J|uU8#5?jcmXJ71eyqZNirgvg zM)hm*SbxANN!$42*kh$cCdu;Rtdfk?%QLb{OS3kwm9EU=%p%NU+QjTDDcd6 + withSQLConf(("spark.sql.parquet.writeLegacyFormat", "true")) { + generateAllTypesTable(tablePath) + } + } + generateGoldenTable("parquet-all-types") { tablePath => + // generating using the standard parquet format + generateAllTypesTable(tablePath) + } + + def generateAllTypesTable(tablePath: String): Unit = { val timeZone = java.util.TimeZone.getTimeZone("UTC") java.util.TimeZone.setDefault(timeZone) import java.sql._ @@ -1113,6 +1124,7 @@ class GoldenTables extends QueryTest with SharedSparkSession { if (i % 61 != 0) new java.sql.Date(i * 20000000L) else null, if (i % 62 != 0) new Timestamp(i * 23423523L) else null, if (i % 69 != 0) LocalDateTime.ofEpochSecond(i * 234234L, 200012, UTC) else null, + // nested_struct if (i % 63 != 0) { if (i % 19 == 0) { // write a struct with all fields null @@ -1121,6 +1133,7 @@ class GoldenTables extends QueryTest with SharedSparkSession { Row(i.toString, if (i % 23 != 0) Row(i) else null) } } else null, + // array_of_prims if (i % 25 != 0) { if (i % 29 == 0) { scala.Array() @@ -1128,6 +1141,7 @@ class GoldenTables extends QueryTest with SharedSparkSession { scala.Array(i, null, i + 1) } } else null, + // array_of_arrays if (i % 8 != 0) { val singleElemArray = scala.Array(i) val doubleElemArray = scala.Array(i + 10, i + 20) @@ -1144,7 +1158,11 @@ class GoldenTables extends QueryTest with SharedSparkSession { case 6 => scala.Array() } } else null, - scala.Array(Row(i.longValue()), null), + // array_of_structs + if (i % 10 != 0) { + scala.Array(Row(i.longValue()), null) + } else null, + // map_of_prims if (i % 28 != 0) { if (i % 30 == 0) { Map() @@ -1155,7 +1173,11 @@ class GoldenTables extends QueryTest with SharedSparkSession { ) } } else null, - Map(i + 1 -> (if (i % 10 == 0) Row((i * 20).longValue()) else null)), + // map_of_rows + if (i % 25 != 0) { + Map(i + 1 -> (if (i % 10 == 0) Row((i * 20).longValue()) else null)) + } else null, + // map_of_arrays if (i % 30 != 0) { if (i % 24 == 0) { Map() @@ -1181,7 +1203,6 @@ class GoldenTables extends QueryTest with SharedSparkSession { .save(tablePath) } - def writeBasicDecimalTable(tablePath: String): Unit = { val data = Seq( Seq("234", "1", "2", "3"), diff --git a/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/parquet/ArrayColumnReader.java b/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/parquet/ArrayColumnReader.java index 1335badc312..5d111846f84 100644 --- a/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/parquet/ArrayColumnReader.java +++ b/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/parquet/ArrayColumnReader.java @@ -19,10 +19,13 @@ import org.apache.parquet.io.api.Converter; import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.Type; import io.delta.kernel.data.ColumnVector; import io.delta.kernel.types.ArrayType; +import static io.delta.kernel.internal.util.Preconditions.checkArgument; + import io.delta.kernel.defaults.internal.data.vector.DefaultArrayVector; import static io.delta.kernel.defaults.internal.parquet.ParquetColumnReaders.createConverter; @@ -52,15 +55,38 @@ public ColumnVector getDataColumnVector(int batchSize) { return arrayVector; } + /** + * Currently, support for 3-level nested arrays only. + *

+ * optional group readerFeatures (LIST) { + * repeated group list { + * optional binary element (STRING); + * } + * } + *

+ * optional group readerFeatures (LIST) { + * repeated group bag { + * optional binary array (STRING); + * } + * } + * + * TODO: Add support for 2-level nested arrays. + */ private static Converter createElementConverter( int initialBatchSize, ArrayType typeFromClient, GroupType typeFromFile) { - final GroupType innerElementType = (GroupType) typeFromFile.getType("list"); - return createConverter( - initialBatchSize, - typeFromClient.getElementType(), - innerElementType.getType("element")); + checkArgument( + typeFromFile.getFieldCount() == 1, "Expected exactly one field in the array type"); + GroupType repeatedGroup = typeFromFile.getType(0).asGroupType(); + + // TODO: handle the legacy 2-level list physical format + checkArgument(repeatedGroup.getFieldCount() == 1, + "Expected exactly one field in the repeated group"); + + Type elmentType = repeatedGroup.getType(0); + + return createConverter(initialBatchSize, typeFromClient.getElementType(), elmentType); } } diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/parquet/ParquetFileReaderSuite.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/parquet/ParquetFileReaderSuite.scala index ae307f91ea7..d88c36ed245 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/parquet/ParquetFileReaderSuite.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/parquet/ParquetFileReaderSuite.scala @@ -21,6 +21,7 @@ import io.delta.kernel.defaults.utils.{ExpressionTestUtils, TestRow} import io.delta.kernel.test.VectorTestUtils import io.delta.kernel.types._ import org.scalatest.funsuite.AnyFunSuite + class ParquetFileReaderSuite extends AnyFunSuite with ParquetSuiteBase with VectorTestUtils with ExpressionTestUtils { @@ -73,19 +74,22 @@ class ParquetFileReaderSuite extends AnyFunSuite checkAnswer(actResult, expResult) } - private val ALL_TYPES_FILE = goldenTableFile("parquet-all-types").getAbsolutePath - - test("read all types of data") { - val readSchema = tableSchema(ALL_TYPES_FILE) - - val actResult = readParquetFilesUsingKernel(ALL_TYPES_FILE, readSchema) - - val expResult = readParquetFilesUsingSpark(ALL_TYPES_FILE, readSchema) - - checkAnswer(actResult, expResult) + Seq( + "parquet-all-types", + "parquet-all-types-legacy-format" + ).foreach { allTypesTableName => + test(s"read all types of data - $allTypesTableName") { + val allTypesFile = goldenTableFile(allTypesTableName).getAbsolutePath + val readSchema = tableSchema(allTypesFile) + + checkAnswer( + readParquetFilesUsingKernel(allTypesFile, readSchema), /* actual */ + readParquetFilesUsingSpark(allTypesFile, readSchema) /* expected */) + } } test("read subset of columns") { + val tablePath = goldenTableFile("parquet-all-types").getAbsolutePath val readSchema = new StructType() .add("byteType", ByteType.BYTE) .add("booleanType", BooleanType.BOOLEAN) @@ -96,14 +100,13 @@ class ParquetFileReaderSuite extends AnyFunSuite .add("ac", new StructType().add("aca", IntegerType.INTEGER))) .add("array_of_prims", new ArrayType(IntegerType.INTEGER, true)) - val actResult = readParquetFilesUsingKernel(ALL_TYPES_FILE, readSchema) - - val expResult = readParquetFilesUsingSpark(ALL_TYPES_FILE, readSchema) - - checkAnswer(actResult, expResult) + checkAnswer( + readParquetFilesUsingKernel(tablePath, readSchema), /* actual */ + readParquetFilesUsingSpark(tablePath, readSchema) /* expected */) } test("read subset of columns with missing columns in file") { + val tablePath = goldenTableFile("parquet-all-types").getAbsolutePath val readSchema = new StructType() .add("booleanType", BooleanType.BOOLEAN) .add("integerType", IntegerType.INTEGER) @@ -114,11 +117,9 @@ class ParquetFileReaderSuite extends AnyFunSuite .add("aa", StringType.STRING) .add("ac", new StructType().add("aca", IntegerType.INTEGER))) - val actResult = readParquetFilesUsingKernel(ALL_TYPES_FILE, readSchema) - - val expResult = readParquetFilesUsingSpark(ALL_TYPES_FILE, readSchema) - - checkAnswer(actResult, expResult) + checkAnswer( + readParquetFilesUsingKernel(tablePath, readSchema), /* actual */ + readParquetFilesUsingSpark(tablePath, readSchema) /* expected */) } test("request row indices") { diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/parquet/ParquetFileWriterSuite.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/parquet/ParquetFileWriterSuite.scala index 488e4a1af41..a1d0e185e0a 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/parquet/ParquetFileWriterSuite.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/parquet/ParquetFileWriterSuite.scala @@ -58,7 +58,7 @@ class ParquetFileWriterSuite extends AnyFunSuite Seq( // Test cases reading and writing all types of data with or without stats collection - Seq((200, 100), (1024, 29), (1048576, 1)).map { + Seq((200, 67), (1024, 17), (1048576, 1)).map { case (targetFileSize, expParquetFileCount) => ( "write all types (no stats)", // test name @@ -103,7 +103,7 @@ class ParquetFileWriterSuite extends AnyFunSuite ) }, // Test cases reading and writing only a subset of data passing a predicate. - Seq((200, 39), (1024, 11), (1048576, 1)).map { + Seq((200, 26), (1024, 7), (1048576, 1)).map { case (targetFileSize, expParquetFileCount) => ( "write filtered all types (no stats)", // test name @@ -118,7 +118,7 @@ class ParquetFileWriterSuite extends AnyFunSuite ) }, // Test cases reading and writing all types of data WITH stats collection - Seq((200, 100), (1024, 29), (1048576, 1)).map { + Seq((200, 67), (1024, 17), (1048576, 1)).map { case (targetFileSize, expParquetFileCount) => ( "write all types (with stats for all leaf-level columns)", // test name @@ -132,7 +132,7 @@ class ParquetFileWriterSuite extends AnyFunSuite ) }, // Test cases reading and writing all types of data with a partial column set stats collection - Seq((200, 100), (1024, 29), (1048576, 1)).map { + Seq((200, 67), (1024, 17), (1048576, 1)).map { case (targetFileSize, expParquetFileCount) => ( "write all types (with stats for a subset of leaf-level columns)", // test name