test_ps2org.org
To convert a Pandas dataframe to tabular format using tabulate()

import pandas as pd
from tabulate import tabulate
df = pd.DataFrame({
    "a": [1,2,3],
    "b": [4,5,6]
})
df_str = tabulate(df, headers=df.columns, tablefmt="orgtbl", showindex=False)
print(df_str)
|   a |   b |
|-----+-----|
|   1 |   4 |
|   2 |   5 |
|   3 |   6 |

To convert a Pandas dataframe to tabular format using pd2org()

import pandas as pd
df = pd.DataFrame({
    "a": [1,2,3],
    "b": [4,5,6]
})
<<pd2org("df")>>
|   a |   b |
|-----+-----|
|   1 |   4 |
|   2 |   5 |
|   3 |   6 |
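
For reference, pd2org is a noweb macro defined elsewhere in the repository. Judging from the tabulate example above, its expansion is presumably something like the sketch below; the exact definition, and that of its PySpark counterpart ps2org used in the next example, is an assumption:

# Hypothetical pd2org("df") expansion (assumed, modeled on the tabulate example):
print(tabulate(df, headers=df.columns, tablefmt="orgtbl", showindex=False))
# Hypothetical ps2org("df") expansion for a PySpark dataframe (assumed):
print(tabulate(df.toPandas(), headers=df.columns, tablefmt="orgtbl", showindex=False))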

To convert a PySpark dataframe to tabular format using ps2org()

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("test-app").getOrCreate()
schema = T.StructType(
    [
        T.StructField("a", T.IntegerType(), True),
        T.StructField("b", T.IntegerType(), True),
    ]
)
data = [(1, 4), (2, 5), (3, 6)]
df = spark.createDataFrame(schema=schema, data=data)
<<ps2org("df")>>
|   a |   b |
|-----+-----|
|   1 |   4 |
|   2 |   5 |
|   3 |   6 |

To convert a PySpark dataframe to tabular format using separate actual and shown code

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
spark = SparkSession.builder.config("spark.log.level", "OFF").master("local").appName("test-app").getOrCreate()
<<nostderr("spark")>>
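# <<nostderr("spark")>> is a noweb macro defined elsewhere in the repository;
# presumably it suppresses Spark's console chatter, e.g. something like
# spark.sparkContext.setLogLevel("OFF") (an assumption, not the actual definition).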

schema = T.StructType(
    [
        T.StructField("a", T.IntegerType(), True),
        T.StructField("b", T.IntegerType(), True),
    ]
)
data = [(1, 4), (2, 5), (3, 6)]
df = spark.createDataFrame(schema=schema, data=data)
print("Dataframe df:")
<<show2org("df")>>df.show()
|   a |   b |
|-----+-----|
|   1 |   4 |
|   2 |   5 |
|   3 |   6 |

During evaluation, this is converted into the following code block; note that show2org() injects the table-printing call and leaves the original df.show() commented out at the end of the line:

from tabulate import tabulate
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("test-app").getOrCreate()
schema = T.StructType(
    [
        T.StructField("a", T.IntegerType(), True),
        T.StructField("b", T.IntegerType(), True),
    ]
)
data = [(1, 4), (2, 5), (3, 6)]
df = spark.createDataFrame(schema=schema, data=data)
print(df.toPandas().to_markdown(index=False, tablefmt='orgtbl'))#df.show()

To convert a PySpark dataframe to tabular format using a returned value and noweb

# Built-in namespace
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local").appName("test-app").getOrCreate()
schema = T.StructType(
    [
        T.StructField("a", T.IntegerType(), True),
        T.StructField("b", T.IntegerType(), True),
    ]
)
data = [(1, 4), (2, 5), (3, 6)]
df = spark.createDataFrame(schema=schema, data=data)
df<<litps2org>>.show()
|   a |   b |
|-----+-----|
|   1 |   4 |
|   2 |   5 |
|   3 |   6 |
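
litps2org is an inline noweb macro defined elsewhere in the repository; whatever it expands to must turn df<<litps2org>>.show() into valid Python that prints the table above. One way this could work is sketched below; the expansion text itself is an assumption, not the actual definition:

# Hypothetical result of expanding df<<litps2org>>.show(): the spliced-in
# text converts to Pandas, prints an org table, and ends with `#` so that
# the original `.show()` call is commented out.
df.toPandas().pipe(lambda d: print(d.to_markdown(index=False, tablefmt="orgtbl")))  # .show()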

To convert a PySpark dataframe to tabular format using post-processing with AWK (GNU Awk, for gensub())

echo "$data"  | awk 'BEGIN{state_prev=""; prev_line=""}{                          \
              if ($0 ~ /^\+[-+]+\+$/){                                            \
                     state_curr = "hline"                                         \
              } else {                                                            \
                     if ($0 ~ /^\|.*\|$/) {                                       \
                          state_curr = "tblbody"                                  \
                      }                                                           \
                      else {                                                      \
                          state_curr = "txt"                                      \
                      }                                                           \
               }                                                                  \
                                                                                  \
              if ((state_curr == "hline") && (state_prev == "txt")) {             \
                     printf("%s", prev_line);                                     \
                     prev_line = "";                                              \
              } else if ((state_curr == "txt") && (state_prev == "hline")) {      \
                     prev_line = $0;                                              \
              } else if ((state_curr == "hline") && (state_prev == "")) {         \
                     prev_line = "";                                              \
              } else if ((state_curr == "txt") && (state_prev == "")) {           \
                     printf("%s", prev_line);                                     \
                     prev_line = gensub(/^\+([-+]+)\+$/, "|\\1|", "g", $0);       \
              } else {                                                            \
                     if (NR > 2) {                                                \
                          printf("%s\n", prev_line);                              \
                     }                                                            \
                     prev_line = gensub(/^\+([-+]+)\+$/, "|\\1|", "g", $0);       \
              }                                                                   \
              state_prev = state_curr;                                            \
              }END{if (prev_line !~ /^\|.*\|$/) {print prev_line}}'
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession
from tabulate import tabulate
spark = SparkSession.builder.master("local[1]").appName("test-app").getOrCreate()
schema = T.StructType(
    [
        T.StructField("A", T.ArrayType(T.StringType()), True),
        T.StructField("B", T.ArrayType(T.StringType()), True),
    ]
)
data = [(["b", "a", "c"], ["c", "d", "a", "f"])]
df = spark.createDataFrame(schema=schema, data=data)

dft = df.select("A", "B",
          F.array_except("A", "B").alias("A\\B"),
          F.array_except("B", "A").alias("B\\A"))
print("Table 1:")
dft.show()

print("Table 2:")
dft.show()

print("Two tables are the same.")
|        A|           B|A\B|   B\A|
|---------+------------+---+------|
|[b, a, c]|[c, d, a, f]|[b]|[d, f]|

Table 2:

|        A|           B|A\B|   B\A|
|---------+------------+---+------|
|[b, a, c]|[c, d, a, f]|[b]|[d, f]|

Two tables are the same.

To convert a PySpark dataframe to tabular format using post-processing with SED

echo "$data" | sed -E "s/^\+([-+]+)\+$/|\1|/g"
|        A|           B|A\B|   B\A|
|---------+------------+---+------|
|[b, a, c]|[c, d, a, f]|[b]|[d, f]|

Table 2:

|        A|           B|A\B|   B\A|
|---------+------------+---+------|
|[b, a, c]|[c, d, a, f]|[b]|[d, f]|

Two tables are the same.

To convert a PySpark dataframe to tabular format using post-processing with Python

The formatting of a PySpark dataframe's show() output is done in its showString() method.
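
A minimal sketch of how the raw show() text could end up in the data variable used below; it calls the JVM-side showString() through the internal _jdf attribute, so treat the exact call as an assumption about PySpark internals:

# Capture what df.show() would print, instead of letting it go to stdout.
# Arguments: number of rows, truncate width, vertical layout (internal API).
data = df._jdf.showString(20, 20, False)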

import re

# `data` holds the captured text of the df.show() calls (supplied to this
# block externally, e.g. via the Org source block's :var header argument).
state_prev = ""
prev_line = ""

for j, line in enumerate(data.split("\n")):
    # Classify each line: an ASCII rule (+---+), a table row (|...|), or text.
    if re.match(r"^\+[-+]+\+$", line):
        state_curr = "hline"
    elif re.match(r"^\|.*\|$", line):
        state_curr = "tblbody"
    else:
        state_curr = "txt"

    if (state_curr == "hline") and (state_prev == "txt"):
        print(prev_line, end="")
        prev_line = ""
    elif (state_curr == "txt") and (state_prev == "hline"):
        # The buffered rule that closed a table is dropped here.
        print("", end="")
        prev_line = line
    elif (state_curr == "txt") and (state_prev == ""):
        print(prev_line, end="")
        prev_line = re.sub(r"^\+([-+]+)\+$", r"|\1|", line)
    else:
        if j > 0:
            print(prev_line, end="\n")
        # Convert a rule line (if any) to an org separator and buffer it.
        prev_line = re.sub(r"^\+([-+]+)\+$", r"|\1|", line)

    state_prev = state_curr

if not re.match(r"^\|[-+]+\|$", prev_line):
    print(prev_line)
|        A|           B|A\B|   B\A|
|---------+------------+---+------|
|[b, a, c]|[c, d, a, f]|[b]|[d, f]|

Table 2:

|        A|           B|A\B|   B\A|
|---------+------------+---+------|
|[b, a, c]|[c, d, a, f]|[b]|[d, f]|

Two tables are the same.

To convert a PySpark dataframe to HTML format using a built-in function

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession
from pyspark import SparkConf

# This configuration is needed to enable HTML rendering
conf = SparkConf().set("spark.sql.repl.eagerEval.enabled", "true")

spark = SparkSession.builder.master("local[1]").appName("test-app").config(conf=conf).getOrCreate()
schema = T.StructType(
    [
        T.StructField("a", T.IntegerType(), True),
        T.StructField("b", T.IntegerType(), True),
    ]
)
data = [(1, 4), (2, 5), (3, 6)]
df = spark.createDataFrame(schema=schema, data=data)
print(df._repr_html_())
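
When the eagerEval flag is not set, _repr_html_() returns None rather than an HTML table, so a small guard helps (a sketch):

# _repr_html_() yields an HTML <table> string only when eager evaluation is
# enabled; otherwise it returns None, so fall back to the plain-text output.
html = df._repr_html_()
if html is None:
    df.show()
else:
    print(html)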