diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d249aaf..1d8b8bf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -61,6 +61,7 @@ jobs: run: | cd datamancer nimble -y test + nimble -y testJs - name: Build docs if: > diff --git a/README.org b/README.org index 85be412..5967b7a 100644 --- a/README.org +++ b/README.org @@ -1,9 +1,11 @@ - * Datamancer [[https://github.com/SciNim/datamancer/workflows/datamancer%20CI/badge.svg]] [[https://matrix.to/#/#nim-science:envs.net][https://img.shields.io/static/v1?message=join%20chat&color=blue&label=nim-science&logo=matrix&logoColor=gold&style=flat-square&.svg]] [[https://discord.gg/f5hA9UK3dY][https://img.shields.io/discord/371759389889003530?color=blue&label=nim-science&logo=discord&logoColor=gold&style=flat-square&.svg]] +~Datamancer~ is a DataFrame library for Nim, which is heavily inspired +by [[https://dplyr.tidyverse.org/][dplyr]]. + ** Comparison to other dataframe libraries Check out the following gist for a comparison of this library with @@ -11,7 +13,6 @@ dplyr (R) and pandas (Python): https://gist.github.com/Vindaar/6908c038707c7d8293049edb3d204f84 - ** Documentation The documentation is found at: @@ -34,6 +35,19 @@ nimble install datamancer #+END_SRC away. +** Backend targets + +The library supports both Nim's C/C++ backends as well as the +JavaScript target. The latter was added in ~v0.4.2~ and is still +experimental. Certain features are currently not supported (reading +files from disk, reading CSVs from a URL). + +Note also that on older versions than current devel (as of +<2024-02-19 Mon 14:47>) including the current stable, in certain +applications the formula macro ~f{}~ (see below) does not resolve +types in the way it should on the JS backend. You might need to give +explicit type hints in more cases than usual (also see below). + ** Features and formulas The data frame provides the "5 verbs" of [[https://dplyr.tidyverse.org/][dplyr]] and more. Main implemented functions: diff --git a/datamancer.nimble b/datamancer.nimble index e40fb81..d1225ae 100644 --- a/datamancer.nimble +++ b/datamancer.nimble @@ -10,7 +10,7 @@ srcDir = "src" # Dependencies requires "nim >= 1.2.0" -requires "https://github.com/Vindaar/seqmath >= 0.1.11" +requires "https://github.com/Vindaar/seqmath >= 0.2.1" requires "arraymancer >= 0.7.28" task test, "Run standard tests": @@ -21,6 +21,9 @@ task test, "Run standard tests": exec "nim c -r tests/testsFormula.nim" exec "nim c -r tests/testParse.nim" +task testJs, "Run test for JS": + exec "nim js -r tests/testdf_js_simple.nim" + import os, strutils, strformat const pkgName = "datamancer" diff --git a/src/datamancer.nim b/src/datamancer.nim index b453b8f..4cece30 100644 --- a/src/datamancer.nim +++ b/src/datamancer.nim @@ -1,4 +1,8 @@ ## .. include:: ./docs/datamancer.rst -import datamancer / [dataframe, io] -export dataframe, io +when not defined(js): + import datamancer / [dataframe, io] + export dataframe, io +else: + import datamancer / [dataframe, io] + export dataframe, io diff --git a/src/datamancer/column.nim b/src/datamancer/column.nim index 36ce4d0..bf71741 100644 --- a/src/datamancer/column.nim +++ b/src/datamancer/column.nim @@ -1,4 +1,8 @@ -import arraymancer/tensor +when not defined(js): + import arraymancer/tensor +else: + import seq_tensor + import std / [sugar, strformat, tables, macros, strutils] import value @@ -346,14 +350,15 @@ proc toColumn*[C: ColumnLike; T](_: typedesc[C], t: Tensor[T]): C = doAssert false, "This should not happen!" #{.error: "Cannot store " & $T & " in a regular `Column`.".} -proc toColumn*[T: not SupportedTypes](t: openArray[T] | Tensor[T]): auto = - ## Tries to convert the given input data to a matching generic `Column*` - ## type. Errors at CT if there is no matching `Column*` defined so far. - when typeof(t) is Tensor: - let x = t - else: - let x = t.toTensor() - result = colType(T).toColumn(t) +when not defined(js) or (NimMajor, NimMinor, NimPatch) >= (2, 1, 0): + proc toColumn*[T: not SupportedTypes](t: openArray[T] | Tensor[T]): auto = + ## Tries to convert the given input data to a matching generic `Column*` + ## type. Errors at CT if there is no matching `Column*` defined so far. + when typeof(t) is Tensor: + let x = t + else: + let x = t.toTensor() + result = colType(T).toColumn(t) proc toColumn*[C: ColumnLike; T](_: typedesc[C], t: openArray[T]): C = if t.len > 0: diff --git a/src/datamancer/dataframe.nim b/src/datamancer/dataframe.nim index 8214d30..312ac02 100644 --- a/src/datamancer/dataframe.nim +++ b/src/datamancer/dataframe.nim @@ -1,8 +1,12 @@ import std / [macros, tables, strutils, options, sets, hashes, math, sequtils, stats, strformat, algorithm, typetraits] -import arraymancer/tensor -export tensor +when not defined(js): + import arraymancer/tensor + export tensor +else: + import seq_tensor + export seq_tensor import value export value @@ -1983,7 +1987,7 @@ proc innerJoin*[C: ColumnLike](df1, df2: DataTable[C], by: string): DataTable[C] for k in getKeys(result): if result[k].kind != colConstant: # if constant nothing to short withNativeTensor(result[k], t): - result.asgn(k, toColumn(t[_ ..< result.len])) + result.asgn(k, toColumn(t[0 ..< result.len])) result[k].len = result.len proc innerJoin*[C: ColumnLike](dfs: varargs[DataTable[C]], by: string): DataTable[C] = diff --git a/src/datamancer/formulaExp.nim b/src/datamancer/formulaExp.nim index 5a3b4e8..8f12ecb 100644 --- a/src/datamancer/formulaExp.nim +++ b/src/datamancer/formulaExp.nim @@ -709,8 +709,11 @@ proc convertLoop(p: Preface, dtype, fctColResType, loop: NimNode, fnKind: FormulaKind, generateLoop: bool): NimNode = let memCopyable = ["float", "int", "bool"] - let isMemCopyable = dtype.strVal in memCopyable and - p.args.allIt(it.colType.strVal in memCopyable) + when defined(js): + let isMemcopyable = false + else: + let isMemCopyable = dtype.strVal in memCopyable and + p.args.allIt(it.colType.strVal in memCopyable) proc genForLoop(p: Preface, loop: NimNode, fkKind: FormulaKind): NimNode = var mpreface = p let loopIndexed = fixupTensorIndices(loop, mpreface, diff --git a/src/datamancer/io.nim b/src/datamancer/io.nim index 8ebd14c..f344547 100644 --- a/src/datamancer/io.nim +++ b/src/datamancer/io.nim @@ -1,10 +1,10 @@ import dataframe, value, column -import memfiles, streams, strutils, tables, parsecsv, sequtils -# for reading CSV files from URLs -import httpclient -# for `showBrowser` -import browsers, strformat, os +import std / [streams, strutils, tables, parsecsv, sequtils, strformat, os] +when not defined(js): + import memfiles + # for reading CSV files from URLs (former) and `showBrowsers` (latter) + import httpclient, browsers proc checkHeader(s: Stream, fname, header: string, colNames: seq[string]): bool = ## checks whether the given file contains the header `header` @@ -79,16 +79,45 @@ proc readCsv*(s: Stream, result[colHeaders[i]].add parser.rowEntry(col) parser.close() -template copyBuf(data: ptr UncheckedArray[char], buf: var string, + +when defined(js): + type + MemoryView[T] = seq[T] + + proc toMemoryView[T](s: seq[T]): MemoryView[T] = s + proc toMemoryView(s: string): MemoryView[char] = + result = newSeq[char](s.len + 1) + for i, x in s: + result[i] = x + ## To simulate behavior of regular Nim strings on C backend (after accessing via MemoryView) + ## we need a zero byte to correctly behave in `parseNumber` + result[^1] = '\0' +else: + type + MemoryView[T] = ptr UncheckedArray[T] + template address(x: untyped): untyped = + when (NimMajor, NimMinor, NimPatch) <= (2, 0, 0): + unsafeAddr x + else: + addr x + proc toMemoryView[T](s: seq[T]): MemoryView[T] = cast[ptr UncheckedArray[T]](s[0].address) + proc toMemoryView(s: string): MemoryView[char] = cast[ptr UncheckedArray[char]](s[0].address) + proc toMemoryView[T](p: pointer): MemoryView[T] = cast[ptr UncheckedArray[T]](p) + +template copyBuf(data: MemoryView[char], buf: var string, idx, colStart: int): untyped = let nIdx = idx - colStart if nIdx > 0: buf.setLen(nIdx) # will auto reallocate if `len` is larger than capacity! - copyMem(buf[0].addr, data[colStart].addr, nIdx) + when defined(js): + for i in 0 ..< nIdx: + buf[i] = data[colStart + i] + else: + copyMem(buf[0].addr, data[colStart].addr, nIdx) else: buf.setLen(0) -template parseHeaderCol(data: ptr UncheckedArray[char], buf: var string, +template parseHeaderCol(data: MemoryView[char], buf: var string, colNames: var seq[string], header: string, sep, quote: char, idx, colStart: int): untyped = @@ -116,7 +145,7 @@ template parseHeaderCol(data: ptr UncheckedArray[char], buf: var string, else: colNames.add bufStripped -template guessType(data: ptr UncheckedArray[char], buf: var string, +template guessType(data: MemoryView[char], buf: var string, colTypes: var seq[ColKind], col, idx, colStart, numCols, quote: untyped): untyped = # only determine types for as many cols as in header @@ -167,7 +196,7 @@ func normalizeChar(c: char): char = else: c -func tryParse(toEat: seq[char], data: ptr UncheckedArray[char], idx: var int, +func tryParse(toEat: seq[char], data: MemoryView[char], idx: var int, sep: char, retTyp: RetType, retVal: float, floatVal: var float): RetType = ## tries to parse certain strings `NaN`, `Inf` into floats @@ -185,8 +214,7 @@ func tryParse(toEat: seq[char], data: ptr UncheckedArray[char], idx: var int, else: return rtError - -proc parseNumber(data: ptr UncheckedArray[char], +proc parseNumber(data: MemoryView[char], sep, quote: char, # if this sep is found parsing ends idxIn: int, intVal: var int, floatVal: var float): RetType {.inline, noinit.} = @@ -207,7 +235,7 @@ proc parseNumber(data: ptr UncheckedArray[char], intVal = 0 # build intVal up from zero.. if data[idx] in Sign + {quote}: idx.inc # skip optional sign or quote character - while data[idx] != '\0': # ..and track scale/pow10. + while data[idx] != '\0': # ..and track scale/pow10. if data[idx] notin Digits: if data[idx] != '.' or pnt >= 0: break # a second '.' is forbidden @@ -220,6 +248,7 @@ proc parseNumber(data: ptr UncheckedArray[char], p10.inc # any digit moves implicit '.' idx.inc nD.inc + if data[idxIn] == '-': intVal = -intVal # adjust sign @@ -300,7 +329,7 @@ proc parseStringDigit(s: string, quote: char): int = else: raise newException(ValueError, "Input string " & $s & " is not a valid string digit.") -template parseCol(data: ptr UncheckedArray[char], buf: var string, +template parseCol(data: MemoryView[char], buf: var string, col: var Column, sep, quote: char, colTypes: seq[ColKind], colIdx, idx, colStart, row, numCols: int, @@ -381,7 +410,7 @@ template advanceToNextRow() {.dirty.} = if maxLines > 0 and row >= maxLines: break -template parseLine(data: ptr UncheckedArray[char], buf: var string, +template parseLine(data: MemoryView[char], buf: var string, sep, quote, lineBreak, eat: char, col, idx, colStart, row, rowStart: var int, lastWasSep, inQuote: var bool, @@ -433,7 +462,7 @@ proc allColTypesSet(colTypes: seq[ColKind]): bool = ## checks if all column types are determined, i.e. not `colNone` the default result = colTypes.allIt(it != colNone) -proc readCsvTypedImpl(data: ptr UncheckedArray[char], +proc readCsvTypedImpl(data: MemoryView[char], size: int, lineCnt: int, sep: char = ',', @@ -632,112 +661,127 @@ proc parseCsvString*(csvData: string, result = newDataFrame() ## we're dealing with ASCII files, thus each byte can be interpreted as a char - var data = cast[ptr UncheckedArray[char]](csvData[0].unsafeAddr) + var data = toMemoryView(csvData) result = readCsvTypedImpl(data, csvData.len, countNonEmptyLines(csvData), sep, header, skipLines, maxLines, toSkip, colNames, skipInitialSpace, quote, maxGuesses, lineBreak, eat, allowLineBreaks = allowLineBreaks) -proc readCsvFromUrl(url: string, - sep: char = ',', - header: string = "", - skipLines = 0, - maxLines = 0, - toSkip: set[char] = {}, - colNames: seq[string] = @[], - skipInitialSpace = true, - quote = '"' - ): DataFrame = - ## Reads a DF from a web URL (which must contain a CSV file) - var client = newHttpClient() - return parseCsvString(client.getContent(url), sep, header, skipLines, maxLines, toSkip, colNames, - skipInitialSpace, quote) - -proc readCsv*(fname: string, - sep: char = ',', - header: string = "", - skipLines = 0, - maxLines = 0, - toSkip: set[char] = {}, - colNames: seq[string] = @[], - skipInitialSpace = true, - quote = '"', - maxGuesses = 20, - lineBreak = '\n', - eat = '\r', - allowLineBreaks = false - ): DataFrame = - ## Reads a DF from a CSV file or a web URL using the separator character `sep`. - ## - ## `fname` can be a local filename or a web URL. If `fname` starts with - ## "http://" or "https://" the file contents will be read from the selected - ## web server. No caching is performed so if you plan to read from the same - ## URL multiple times it might be best to download the file manually instead. - ## Please note that to download files from https URLs you must compile with - ## the -d:ssl option. - ## - ## `toSkip` can be used to skip optional characters that may be present - ## in the data. For instance if a CSV file is separated by `,`, but contains - ## additional whitespace (`5, 10, 8` instead of `5,10,8`) this can be - ## parsed correctly by setting `toSkip = {' '}`. - ## - ## `header` designates the symbol that defines the header of the CSV file. - ## By default it's empty meaning that the first line will be treated as - ## the header. If a header is given, e.g. `"#"`, this means we will determine - ## the column names from the first line (which has to start with `#`) and - ## skip every line until the first line starting without `#`. - ## - ## `skipLines` is used to skip `N` number of lines at the beginning of the - ## file. - ## - ## `maxLines` is used to stop parsing after this many lines have been parsed. - ## Does not count any `skipLines` or header lines. - ## - ## `colNames` can be used to overwrite (or supply if none in file!) names of the - ## columns in the header. This is also useful if the header is not conforming - ## to the separator of the file. Note: if you `do` supply custom column names, - ## but there `is` a header in the file, make sure to use `skipLines` to skip - ## that header, as we will not try to parse any header information if `colNames` - ## is supplied. - ## - ## `maxGuesses` is the maximum number of rows to look at before we give up - ## trying to determine the datatype of the column and set it to 'object'. - ## - ## `lineBreak` is the character used to detect if a new line starts. `eat` - ## on the other hand is simply ignore. For unix style line endings the defaults - ## are fine. In principle for windows style endings `\r\n` the defaults *should* - ## work as well, but in rare cases the default causes issues with mismatched - ## line counts. In those cases try to switch `lineBreaks` and `eat` around. - ## - ## If `allowLineBreaks` is `true`, line breaks are allowed inside of quoted fields. - ## Otherwise (the default) we raise an exception due to an unexpected number of - ## lines in the file. This is because we perform an initial pass to count the number - ## of lines and wish to err on the side of correctness (rather raise than parse - ## garbage if the file is fully malformed). - ## - ## *NOTE*: If this CSV parser is too brittle for your CSV file, an older, slower - ## parser using `std/parsecsv` is available under the name `readCsvAlt`. However, - ## it does not return a full `DataFrame`. You need to call `toDf` on the result. - if fname.startsWith("http://") or fname.startsWith("https://"): - return readCsvFromUrl(fname, sep=sep, header=header, skipLines=skipLines, - toSkip=toSkip, colNames=colNames) - let fname = fname.expandTilde() - result = newDataFrame() - try: - var ff = memfiles.open(fname) - var lineCnt = 0 - for slice in memSlices(ff, delim = lineBreak, eat = eat): - if slice.size > 0: - inc lineCnt - - ## we're dealing with ASCII files, thus each byte can be interpreted as a char - var data = cast[ptr UncheckedArray[char]](ff.mem) - result = readCsvTypedImpl(data, ff.size, lineCnt, sep, header, skipLines, maxLines, toSkip, colNames, - skipInitialSpace, quote, maxGuesses, lineBreak, eat, - allowLineBreaks = allowLineBreaks) - ff.close() - except OSError: - raise newException(OSError, "Attempt to read CSV file: " & $fname & " failed. No such file or directory.") +when not defined(js): + proc readCsvFromUrl(url: string, + sep: char = ',', + header: string = "", + skipLines = 0, + maxLines = 0, + toSkip: set[char] = {}, + colNames: seq[string] = @[], + skipInitialSpace = true, + quote = '"' + ): DataFrame = + ## Reads a DF from a web URL (which must contain a CSV file) + var client = newHttpClient() + return parseCsvString(client.getContent(url), sep, header, skipLines, maxLines, toSkip, colNames, + skipInitialSpace, quote) + + proc readCsv*(fname: string, + sep: char = ',', + header: string = "", + skipLines = 0, + maxLines = 0, + toSkip: set[char] = {}, + colNames: seq[string] = @[], + skipInitialSpace = true, + quote = '"', + maxGuesses = 20, + lineBreak = '\n', + eat = '\r', + allowLineBreaks = false + ): DataFrame = + ## Reads a DF from a CSV file or a web URL using the separator character `sep`. + ## + ## `fname` can be a local filename or a web URL. If `fname` starts with + ## "http://" or "https://" the file contents will be read from the selected + ## web server. No caching is performed so if you plan to read from the same + ## URL multiple times it might be best to download the file manually instead. + ## Please note that to download files from https URLs you must compile with + ## the -d:ssl option. + ## + ## `toSkip` can be used to skip optional characters that may be present + ## in the data. For instance if a CSV file is separated by `,`, but contains + ## additional whitespace (`5, 10, 8` instead of `5,10,8`) this can be + ## parsed correctly by setting `toSkip = {' '}`. + ## + ## `header` designates the symbol that defines the header of the CSV file. + ## By default it's empty meaning that the first line will be treated as + ## the header. If a header is given, e.g. `"#"`, this means we will determine + ## the column names from the first line (which has to start with `#`) and + ## skip every line until the first line starting without `#`. + ## + ## `skipLines` is used to skip `N` number of lines at the beginning of the + ## file. + ## + ## `maxLines` is used to stop parsing after this many lines have been parsed. + ## Does not count any `skipLines` or header lines. + ## + ## `colNames` can be used to overwrite (or supply if none in file!) names of the + ## columns in the header. This is also useful if the header is not conforming + ## to the separator of the file. Note: if you `do` supply custom column names, + ## but there `is` a header in the file, make sure to use `skipLines` to skip + ## that header, as we will not try to parse any header information if `colNames` + ## is supplied. + ## + ## `maxGuesses` is the maximum number of rows to look at before we give up + ## trying to determine the datatype of the column and set it to 'object'. + ## + ## `lineBreak` is the character used to detect if a new line starts. `eat` + ## on the other hand is simply ignore. For unix style line endings the defaults + ## are fine. In principle for windows style endings `\r\n` the defaults *should* + ## work as well, but in rare cases the default causes issues with mismatched + ## line counts. In those cases try to switch `lineBreaks` and `eat` around. + ## + ## If `allowLineBreaks` is `true`, line breaks are allowed inside of quoted fields. + ## Otherwise (the default) we raise an exception due to an unexpected number of + ## lines in the file. This is because we perform an initial pass to count the number + ## of lines and wish to err on the side of correctness (rather raise than parse + ## garbage if the file is fully malformed). + ## + ## *NOTE*: If this CSV parser is too brittle for your CSV file, an older, slower + ## parser using `std/parsecsv` is available under the name `readCsvAlt`. However, + ## it does not return a full `DataFrame`. You need to call `toDf` on the result. + if fname.startsWith("http://") or fname.startsWith("https://"): + when not defined(js): + return readCsvFromUrl(fname, sep=sep, header=header, skipLines=skipLines, + toSkip=toSkip, colNames=colNames) + else: + raise newException(ValueError, "Cannot perform http request in parseCsv for JS backend at the moment.") + let fname = fname.expandTilde() + result = newDataFrame() + try: + when not defined(js): + var ff = memfiles.open(fname) + var lineCnt = 0 + for slice in memSlices(ff, delim = lineBreak, eat = eat): + if slice.size > 0: + inc lineCnt + ## we're dealing with ASCII files, thus each byte can be interpreted as a char + var data = toMemoryView[char](ff.mem) + let size = ff.size + else: + var ff = open(fname) + var lineCnt = 0 + for slice in lines(ff): + if slice.len > 0: + inc lineCnt + ## we're dealing with ASCII files, thus each byte can be interpreted as a char + var fileDat = fname.readFile() + var data = toMemoryView(fileDat) + let size = data.len + result = readCsvTypedImpl(data, size, lineCnt, sep, header, skipLines, maxLines, toSkip, colNames, + skipInitialSpace, quote, maxGuesses, lineBreak, eat, + allowLineBreaks = allowLineBreaks) + ff.close() + except OSError: + raise newException(OSError, "Attempt to read CSV file: " & $fname & " failed. No such file or directory.") proc readCsvAlt*(fname: string, sep = ',', @@ -847,27 +891,28 @@ proc toHtml*[C: ColumnLike](df: DataTable[C], tmpl = ""): string = body.add "" result = tmpl % (header & body) -proc showBrowser*[C: ColumnLike]( - df: DataTable[C], fname = "df.html", path = getTempDir(), toRemove = false, - htmlTmpl = "") = - ## Displays the given DataFrame as a table in the default browser. - ## - ## `htmlTmpl` can be used as the HTML template of the page on which to print the - ## data frame. It requires two `$#` fields, one for the header of the page and the - ## second for the actual `` body. - ## - ## Note: the HTML generation is not written for speed at this time. For very large - ## dataframes expect bad performance. - let tmpl = if htmlTmpl.len > 0: htmlTmpl else: HtmlTmpl - let fname = path / fname - let page = tmpl % [fname, df.toHtml()] - writeFile(fname, page) - openDefaultBrowser(fname) - if toRemove: - # opening browsers may be slow, so wait a long time before we delete (file still needs to - # be there when the browser is finally open. Thus default is to keep the file - sleep(1000) - removeFile(fname) +when not defined(js): + proc showBrowser*[C: ColumnLike]( + df: DataTable[C], fname = "df.html", path = getTempDir(), toRemove = false, + htmlTmpl = "") = + ## Displays the given DataFrame as a table in the default browser. + ## + ## `htmlTmpl` can be used as the HTML template of the page on which to print the + ## data frame. It requires two `$#` fields, one for the header of the page and the + ## second for the actual `
` body. + ## + ## Note: the HTML generation is not written for speed at this time. For very large + ## dataframes expect bad performance. + let tmpl = if htmlTmpl.len > 0: htmlTmpl else: HtmlTmpl + let fname = path / fname + let page = tmpl % [fname, df.toHtml()] + writeFile(fname, page) + openDefaultBrowser(fname) + if toRemove: + # opening browsers may be slow, so wait a long time before we delete (file still needs to + # be there when the browser is finally open. Thus default is to keep the file + sleep(1000) + removeFile(fname) proc toOrgTable*[C: ColumnLike](df: DataTable[C], precision = 8, emphStrNumber = true): string = ## Converts the given DF to a table formatted in Org syntax. Note that the diff --git a/src/datamancer/seq_tensor.nim b/src/datamancer/seq_tensor.nim new file mode 100644 index 0000000..19029cd --- /dev/null +++ b/src/datamancer/seq_tensor.nim @@ -0,0 +1,63 @@ +## Note: this is for the JS backend only! +## It is a compatibility layer, which simulates the interface of arraymancer tensors +## which we actually use in datamancer +import std / sequtils +import math +export math +import stats +export stats + +type + Tensor*[T] = seq[T] + +proc newTensor*[T](n: int): Tensor[T] = Tensor[T](newSeq[T](n)) +proc newTensorWith*[T](n: int, val: T): Tensor[T] = Tensor[T](newSeqWith(n, val)) +proc newTensorUninit*[T](n: int): Tensor[T] = newTensor[T](n) +#proc `[]=`*[T](t: var Tensor[T], idx: int, val: T) = +# t[idx] = val +proc `[]=`*[T](t: var Tensor[T], idxs: HSlice[int, int], val: T) = + for a in idxs: + t[a] = val + +proc toTensor*[T](x: openArray[T]): Tensor[T] = Tensor[T](@x) +proc toSeq1D*[T](x: Tensor[T]): seq[T] = + result = newSeq[T](x.len) + for i, el in x: + result[i] = el +proc clone*[T](t: Tensor[T]): Tensor[T] = t +proc size*[T](t: Tensor[T]): int = t.len +proc rank*[T](t: Tensor[T]): int = + result = 1 + var tchild: T ## we need to work with a concrete var since T is a typedesc, and can't use t[0] because tensor may not have any concrete elements + when T is Tensor: + result = 1 + rank(tchild) + +proc astype*[T; U](t: Tensor[T], dtype: typedesc[U]): Tensor[U] = t.mapIt(it.dtype) +proc map*[T; U](t: Tensor[T], fn: proc(x: T): U): Tensor[U] = t.mapIt(fn(it)) +template map_inline*(t: untyped, body: untyped): untyped = + type U = typeof(block: + let x {.inject.} = t[0] + let tmp = body + tmp) + + var res = newTensor[U](t.len) + var i = 0 + for x {.inject.} in t: + res[i] = body + inc i + res + +template apply2_inline*(t1, t2: untyped, body: untyped): untyped = + type U = typeof(block: + let x {.inject.} = t1[0] + let y {.inject.} = t2[0] + let tmp = body + tmp) + + doAssert t1.len == t2.len + for i in 0 ..< t1.len: + let x {.inject.} = t1[i] + let y {.inject.} = t2[i] + t1[i] = body + +proc concat*[T](ts: varargs[Tensor[T]], axis: int = 0): Tensor[T] = Tensor[T](sequtils.concat(ts)) diff --git a/tests/testdf_js_simple.nim b/tests/testdf_js_simple.nim new file mode 100644 index 0000000..13b30a1 --- /dev/null +++ b/tests/testdf_js_simple.nim @@ -0,0 +1,2183 @@ +import datamancer, sequtils, math, strutils, streams, sugar, sets, tables +import algorithm +import seqmath +from os import removeFile + +template suite(name, body: untyped): untyped = + block: + body +template test(name, body: untyped): untyped = + block: + body +template check(arg: untyped): untyped = doAssert arg + +when not declared(AssertionDefect): + type AssertionDefect = AssertionError + +suite "Column": + test "Constant columns": + let c = constantColumn(12, 100) + check c.kind == colConstant + check c.len == 100 + check c.cCol == %~ 12 + + for i in 0 ..< c.len: + check c[i, int] == 12 + + test "Column form a scalar": + block Int: + let x = 1 + let c = toColumn x + check c.kind == colInt + check c.toTensor(int) == [1].toTensor + block Float: + let x = 1.0 + let c = toColumn x + check c.kind == colFloat + check c.toTensor(float) == [1.0].toTensor + block String: + let x = "1" + let c = toColumn x + check c.kind == colString + check c.toTensor(string) == ["1"].toTensor + block Bool: + let x = true + let c = toColumn x + check c.kind == colBool + check c.toTensor(bool) == [true].toTensor + + test "Column from scalar Value yields native": + block Int: + let x = %~ 1 + let c = toColumn x + check c.kind == colInt + check c.toTensor(int) == [1].toTensor + block Float: + let x = %~ 1.0 + let c = toColumn x + check c.kind == colFloat + check c.toTensor(float) == [1.0].toTensor + block String: + let x = %~ "1" + let c = toColumn x + check c.kind == colString + check c.toTensor(string) == ["1"].toTensor + block Bool: + let x = %~ true + let c = toColumn x + check c.kind == colBool + check c.toTensor(bool) == [true].toTensor + + test "Adding two equal constant columns": + let c1 = constantColumn(12, 40) + let c2 = constantColumn(12, 60) + check c1.len == 40 + check c1.cCol == %~ 12 + check c2.len == 60 + check c2.cCol == %~ 12 + + let res = add(c1, c2) + check res.kind == colConstant + check res.cCol == %~ 12 + check res.len == 100 + + test "Adding two unequal constant columns of same value type": + let c1 = constantColumn(12, 40) + let c2 = constantColumn(14, 60) + check c1.len == 40 + check c1.cCol == %~ 12 + check c2.len == 60 + check c2.cCol == %~ 14 + + let res = add(c1, c2) + check res.kind == colInt + check res.len == 100 + for i in 0 ..< 100: + if i < 40: + check res[i, int] == 12 + else: + check res[i, int] == 14 + + test "Adding two unequal constant columns of int & float": + let c1 = constantColumn(12, 40) + let c2 = constantColumn(14.0, 60) + check c1.len == 40 + check c1.cCol == %~ 12 + check c2.len == 60 + check c2.cCol == %~ 14.0 + + let res = add(c1, c2) + check res.kind == colFloat + check res.len == 100 + for i in 0 ..< 100: + if i < 40: + check res[i, float] == 12.0 + else: + check res[i, float] == 14.0 + + test "Adding two unequal constant columns of different types": + let c1 = constantColumn(12, 40) + let c2 = constantColumn("foo", 60) + check c1.len == 40 + check c1.cCol == %~ 12 + check c2.len == 60 + check c2.cCol == %~ "foo" + + let res = add(c1, c2) + check res.kind == colObject + check res.len == 100 + for i in 0 ..< 100: + if i < 40: + check res[i, Value] == %~ 12.0 + else: + check res[i, Value] == %~ "foo" + + test "Adding non constant to (equal type) constant column results in native type": + let c1 = constantColumn(5, 5) + let c2 = toColumn [1, 2, 3, 4, 5] + check c1.len == 5 + check c1.cCol == %~ 5 + check c2.len == 5 + check c2.kind == colInt + + let res = add(c1, c2) + check res.kind == colInt + check res.len == 10 + check res[0 ..< 5].toTensor(int) == [5, 5, 5, 5, 5].toTensor + check res[5 ..< 10].toTensor(int) == [1, 2, 3, 4, 5].toTensor + + test "Adding non constant to (compatible type) constant column results in native type": + let c1 = constantColumn(5.0, 5) + let c2 = toColumn [1, 2, 3, 4, 5] + check c1.len == 5 + check c1.cCol == %~ 5.0 + check c2.len == 5 + check c2.kind == colInt + + let res = add(c1, c2) + check res.kind == colFloat + check res.len == 10 + check res[0 ..< 5].toTensor(float) == [5.0, 5.0, 5.0, 5.0, 5.0].toTensor + check res[5 ..< 10].toTensor(float) == [1.0, 2.0, 3.0, 4.0, 5.0].toTensor + + test "Slice assignment to constant column of compatible type leads to native column": + block Single: + var c1 = constantColumn(5, 5) + check c1.len == 5 + check c1.cCol == %~ 5.0 + + c1[3] = 4 + check c1.kind == colInt + check c1.toTensor(int) == [5, 5, 5, 4, 5].toTensor + block Slice: + var c1 = constantColumn(5, 5) + check c1.len == 5 + check c1.cCol == %~ 5.0 + + c1[3 .. 4] = [1, 2].toTensor + check c1.kind == colInt + check c1.toTensor(int) == [5, 5, 5, 1, 2].toTensor + + test "Conversion of constant column results to tensor": + let c = constantColumn(12, 40) + check c.toTensor(0 .. 10, int) == newTensorWith(11, 12) + check c.toTensor(int) == newTensorWith(40, 12) + + test "Lag - lag a tensor by 1 element, fill `default(T)`": + block Int: + let t = [1, 2, 3].toTensor + let exp = t.lag() + check exp.len == t.len + check exp[0] == 0 # default(int) + check exp == [0, 1, 2].toTensor + block Float: + let t = [1.0, 2.0, 3.0].toTensor + let exp = t.lag() + check exp.len == t.len + check exp[0] == 0.0 # default(float) + check exp == [0.0, 1.0, 2.0].toTensor + block String: + let t = ["1", "2", "3"].toTensor + let exp = t.lag() + check exp.len == t.len + check exp[0] == "" # default(string) + check exp == ["", "1", "2"].toTensor + + test "Lag - lag a tensor by 2 element, fill `default(T)`": + block Int: + let t = [1, 2, 3].toTensor + let exp = t.lag(n = 2) + check exp.len == t.len + check exp[0] == 0 # default(int) + check exp[1] == 0 # default(int) + check exp == [0, 0, 1].toTensor + block Float: + let t = [1.0, 2.0, 3.0].toTensor + let exp = t.lag(n = 2) + check exp.len == t.len + check exp[0] == 0.0 # default(float) + check exp[1] == 0.0 # default(float) + check exp == [0.0, 0.0, 1.0].toTensor + block String: + let t = ["1", "2", "3"].toTensor + let exp = t.lag(n = 2) + check exp.len == t.len + check exp[0] == "" # default(string) + check exp[1] == "" # default(string) + check exp == ["", "", "1"].toTensor + + test "Lag - lag a tensor by 1 element, custom fill": + block Int: + let t = [1, 2, 3].toTensor + let exp = t.lag(fill = int.high) + check exp.len == t.len + check exp[0] == int.high # default(int) + check exp == [int.high, 1, 2].toTensor + block Float: + let t = [1.0, 2.0, 3.0].toTensor + let exp = t.lag(fill = NaN) + check exp.len == t.len + check classify(exp[0]) == fcNaN # default(float) + check exp[1 .. 2] == [1.0, 2.0].toTensor + block String: + let t = ["1", "2", "3"].toTensor + let exp = t.lag(fill = "foo") + check exp.len == t.len + check exp[0] == "foo" # default(string) + check exp == ["foo", "1", "2"].toTensor + + test "Lag - lag a column by 1 element, fill `default(T)`": + block Int: + let c = toColumn [1, 2, 3] + let exp = c.lag() + check exp.len == c.len + check exp[0, int] == 0 # default(int) + check exp.toTensor(int) == [0, 1, 2].toTensor + block Float: + let c = toColumn [1.0, 2.0, 3.0] + let exp = c.lag() + check exp.len == c.len + check exp[0, float] == 0.0 # default(float) + check exp.toTensor(float) == [0.0, 1.0, 2.0].toTensor + block String: + let c = toColumn ["1", "2", "3"] + let exp = c.lag() + check exp.len == c.len + check exp[0, string] == "" # default(string) + check exp.toTensor(string) == ["", "1", "2"].toTensor + + test "Lead - lead a tensor by 1 element, fill `default(T)`": + block Int: + let t = [1, 2, 3].toTensor + let exp = t.lead() + check exp.len == t.len + check exp[2] == 0 # default(int) + check exp == [2, 3, 0].toTensor + block Float: + let t = [1.0, 2.0, 3.0].toTensor + let exp = t.lead() + check exp.len == t.len + check exp[2] == 0.0 # default(float) + check exp == [2.0, 3.0, 0.0].toTensor + block String: + let t = ["1", "2", "3"].toTensor + let exp = t.lead() + check exp.len == t.len + check exp[2] == "" # default(string) + check exp == ["2", "3", ""].toTensor + + test "Lead - lead a tensor by 2 element, fill `default(T)`": + block Int: + let t = [1, 2, 3].toTensor + let exp = t.lead(n = 2) + check exp.len == t.len + check exp[1] == 0 # default(int) + check exp[2] == 0 # default(int) + check exp == [3, 0, 0].toTensor + block Float: + let t = [1.0, 2.0, 3.0].toTensor + let exp = t.lead(n = 2) + check exp.len == t.len + check exp[1] == 0.0 # default(float) + check exp[2] == 0.0 # default(float) + check exp == [3.0, 0.0, 0.0].toTensor + block String: + let t = ["1", "2", "3"].toTensor + let exp = t.lead(n = 2) + check exp.len == t.len + check exp[1] == "" # default(string) + check exp[2] == "" # default(string) + check exp == ["3", "", ""].toTensor + + test "Lead - lead a tensor by 1 element, custom fill": + block Int: + let t = [1, 2, 3].toTensor + let exp = t.lead(fill = int.high) + check exp.len == t.len + check exp[2] == int.high # default(int) + check exp == [2, 3, int.high].toTensor + block Float: + let t = [1.0, 2.0, 3.0].toTensor + let exp = t.lead(fill = NaN) + check exp.len == t.len + check classify(exp[2]) == fcNaN # default(float) + check exp[0 .. 1] == [2.0, 3.0].toTensor + block String: + let t = ["1", "2", "3"].toTensor + let exp = t.lead(fill = "foo") + check exp.len == t.len + check exp[2] == "foo" # default(string) + check exp == ["2", "3", "foo"].toTensor + + test "Lead - lead a column by 1 element, fill `default(T)`": + block Int: + let c = toColumn [1, 2, 3] + let exp = c.lead() + check exp.len == c.len + check exp[2, int] == 0 # default(int) + check exp.toTensor(int) == [2, 3, 0].toTensor + block Float: + let c = toColumn [1.0, 2.0, 3.0] + let exp = c.lead() + check exp.len == c.len + check exp[2, float] == 0.0 # default(float) + check exp.toTensor(float) == [2.0, 3.0, 0.0].toTensor + block String: + let c = toColumn ["1", "2", "3"] + let exp = c.lead() + check exp.len == c.len + check exp[2, string] == "" # default(string) + check exp.toTensor(string) == ["2", "3", ""].toTensor + +suite "DataTable parsing": + proc cmpElements[T](s1, s2: seq[T]): bool = + # comparse the two seq, while properly handling `NaN` + result = true + for (x, y) in zip(s1, s2): + when T is float: + if classify(x) == fcNaN xor classify(y) == fcNaN: + return false + elif classify(x) != fcNaN and classify(y) != fcNaN: + if not almostEqual(x, y): return false + # else both NaN + else: + if x != y: return false + + + test "Parsing with inf, NaN": + let exp = """w,x,y,z +1,10,0.1,100 +2,ERR,inf,200 +NaN,N/A,0.3,300 +4,40,0.4,400""" + + let exp2 = exp & "\n" + let exp3 = exp & "\n\n" + let exp4 = exp & "\n\n\n" + + template checkBlock(arg: typed): untyped {.dirty.} = + let df = parseCsvString(arg) + check df["w"].kind == colFloat + check df["x"].kind == colObject # because of invalid floats + check df["y"].kind == colFloat + check df["z"].kind == colInt + check cmpElements(df["w", float].toSeq1D, @[1'f64, 2, NaN, 4]) + check cmpElements(df["x", Value].toSeq1D, @[%~ 10, %~ "ERR", %~ "N/A", %~ 40]) + check cmpElements(df["y", float].toSeq1D, @[0.1, Inf, 0.3, 0.4]) + check cmpElements(df["z", int].toSeq1D, @[100, 200, 300, 400]) + + block NoNewline: + checkBlock(exp) + block OneNewline: + checkBlock(exp2) + block TwoNewlines: + checkBlock(exp3) + block ThreeNewlines: + checkBlock(exp4) + + test "Parsing with newlines after data": + let exp = """x,y,z +1,2,3 +4,5,6 +7,8,9 + + +""" + template checkBlock(): untyped {.dirty.} = + check df["x"].kind == colInt + check df["y"].kind == colInt + check df["z"].kind == colInt + + check df["x", int] == toTensor([1,4,7]) + check df["y", int] == toTensor([2,5,8]) + check df["z", int] == toTensor([3,6,9]) + + block FromString: + let df = parseCsvString(exp) + checkBlock() + #block FromFile: + # let path = "/tmp/test_newlines_datamancer.csv" + # when defined(linux): + # ## XXX: use proper temp handling to check on other OSs + # writeFile(path, exp) + # let df = readCsv(path) + # checkBlock() + # removeFile(path) + + test "Parsing with missing values, float": + let exp = """x,y,z +1,2, +4,,6 +,8,9 +""" + template checkBlock(): untyped {.dirty.} = + check df["x"].kind == colFloat + check df["y"].kind == colFloat + check df["z"].kind == colFloat + check cmpElements(df["x", float].toSeq1D, @[1'f64,4,NaN]) + check cmpElements(df["y", float].toSeq1D, @[2'f64,NaN,8]) + check cmpElements(df["z", float].toSeq1D, @[NaN,6,9]) + + block FromString: + let df = parseCsvString(exp) + checkBlock() + + block FromFile: + let path = "/tmp/test_missing_datamancer.csv" + when defined(linux): + ## XXX: use proper temp handling to check on other OSs + writeFile(path, exp) + let df = readCsv(path) + checkBlock() + removeFile(path) + + test "Parsing with missing values, string": + let exp = """x,y,z +a,2, +aa,3, +b,,foo +,8,bar +""" + template checkBlock(): untyped {.dirty.} = + check df["x"].kind == colString + check df["y"].kind == colFloat + check df["z"].kind == colString + check cmpElements(df["x", string].toSeq1D, @["a", "aa", "b", ""]) + check cmpElements(df["y", float].toSeq1D, @[2'f64,3,NaN,8]) + check cmpElements(df["z", string].toSeq1D, @["","","foo","bar"]) + + block FromString: + let df = parseCsvString(exp) + checkBlock() + + #block FromFile: + # let path = "/tmp/test_missing_string_datamancer.csv" + # when defined(linux): + # ## XXX: use proper temp handling to check on other OSs + # writeFile(path, exp) + # let df = readCsv(path) + # checkBlock() + # removeFile(path) + + test "Parsing with fully column": + ## Reported by @KosKosynsky on the Nim #science channel + let data = """,ID of record,Record number,date_from,date_to,(Name) Additional name if exists,Canal +0,29528,173/MZS/2020,2020/08-04,2021-08-03,,DE +1,29529,113/KEK/1443,2020-08-11,2021-08-10,,DE +2,29530,148/BBK/1527,2020-08-12,2021-08-11,,DE +3,29531,159/ROT/2769,2020-08-13,2021-08-12,,DE +4,29532,745/REZ/3265,2020-08-20,2021-08-19,,DE +5,29533,158/GTK/2144,2020/08-25,2021-0824,,DE +6,29534,151/ZEB/1654,2020-08-28,2021-08-27,,DE +7,29535,158/MTG/6526,2020-08-23,2021-08-22,,DE +""" + let df = parseCsvString(data) + check df.getKeys.len == 7 + check "(Name) Additional name if exists" in df + check "Unnamed0" in df + check df["(Name) Additional name if exists"].kind == colObject + check df["(Name) Additional name if exists", Value][0] == null() + + test "`toHtml`": + let a = [1, 2, 3] + let b = [3, 4, 5] + let c = [4, 5, 6] + let df = toDf(a, b, c) + check df.toHtml == """
+ + + + + + + + + +
Index a

int
b

int
c

int
0134
1245
2356
+""" + +suite "DataTable tests": + + #test "`toDf` is no-op for DF": + # let df = toDf(toDf(readCsv("data/mpg.csv"))) + # check df["class", string] == readCsv("data/mpg.csv")["class", string] + + test "`toDf` works on an OrderedTable[string, seq[string]]": + var tab = initOrderedTable[string, seq[string]]() + tab["x"] = @["1", "2"] + tab["y"] = @["4", "5"] + let df = toDf(tab) + check df["x", int] == [1, 2].toTensor + check df["y", int] == [4, 5].toTensor + + test "`toDf` works on an OrderedTable[string, seq[Value]]": + var tab = initOrderedTable[string, seq[Value]]() + tab["x"] = @[%~ 1, %~ 2] + tab["y"] = @[%~ 4, %~ 5] + let df = toDf(tab) + check df["x", int] == [1, 2].toTensor + check df["y", int] == [4, 5].toTensor + + test "`toDf` works for a single identifier": + let x = @[1, 2, 3] + let df = toDf(x) + check "x" in df + check df["x", int] == [1, 2, 3].toTensor + + test "`toDf` works for multiple identifiers": + let x = @[1, 2, 3] + let y = @[4, 5, 6] + let df = toDf(x, y) + check "x" in df + check df["x", int] == [1, 2, 3].toTensor + check "y" in df + check df["y", int] == [4, 5, 6].toTensor + + test "`toDf` works for a single call": + proc foo(): seq[int] = + result = @[1, 2, 3] + let df = toDf(foo()) + check "foo()" in df + check df["foo()", int] == [1, 2, 3].toTensor + + test "`toDf` works for multiple calls": + proc foo(): seq[int] = + result = @[1, 2, 3] + proc bar(): seq[string] = + result = @["a", "b", "c"] + let df = toDf(foo(), bar()) + check "foo()" in df + check df["foo()", int] == [1, 2, 3].toTensor + check "bar()" in df + check df["bar()", string] == ["a", "b", "c"].toTensor + + test "`toDf` works for a single TableConstr element": + let x = @[1, 2, 3] + let df = toDf({"x" : x}) + check "x" in df + check df["x", int] == [1, 2, 3].toTensor + + test "`toDf` works for multiple TableConstr elements": + let x = @[1, 2, 3] + let y = @[4, 5, 6] + let df = toDf({"x" : x, "y" : y}) + check "x" in df + check df["x", int] == [1, 2, 3].toTensor + check "y" in df + check df["y", int] == [4, 5, 6].toTensor + + test "`toDf` works for a single call in a TableConstr": + proc foo(): seq[int] = + result = @[1, 2, 3] + let df = toDf({"x" : foo()}) + check "x" in df + check df["x", int] == [1, 2, 3].toTensor + + test "`toDf` works for multiple calls in a TableConstr": + proc foo(): seq[int] = + result = @[1, 2, 3] + proc bar(): seq[string] = + result = @["a", "b", "c"] + let df = toDf({"x" : foo(), "y" : bar()}) + check "x" in df + check df["x", int] == [1, 2, 3].toTensor + check "y" in df + check df["y", string] == ["a", "b", "c"].toTensor + + test "`toDf` works in template": + template foo() = + let x = @[1, 2, 3] + let y = @[1, 2, 3] + let df = toDf(x) + check "x" in df + let df2 = toDf(x, y) + check "x" in df2 + check "y" in df2 + foo() + + test "Creation of DFs from seqs": + let a = [1, 2, 3] + let b = [3, 4, 5] + let c = [4, 5, 6] + let d = [8, 9, 10] + # creation directly from a,b,c,d + block: + let df = toDf(a, b, c, d) + check "a" in df + check "b" in df + check "c" in df + check "d" in df + # creation via key / value pairs + block: + let df = toDf({ "one" : a, + "two" : b, + "three" : c, + "four" : d}) + check "one" in df + check "two" in df + check "three" in df + check "four" in df + + #test "Creation of DF w/ int, float other than int64, float64": + # let a = @[123'u8, 12, 55] + # let b = @[1.123'f32, 4.234, 1e12] + # let c = @[1001'i32, 1002, 1003] + # genColumn(uint8) + # #genColumn(uint8, float32, int32) + # var df = seqsToDf({ "a" : a, + # "b" : b }) + # check df["a"].kind == colInt + # check df["b"].kind == colFloat + # check df["a"].toTensor(int) == a.toTensor.asType(int) + # check df["b"].toTensor(float) == b.toTensor.asType(float) + # # check toColumn directly + # df["c"] = toColumn c + # check df["c"].kind == colInt + # check df["c"].toTensor(int) == c.toTensor.asType(int) + + #test "Accessed column of DF is mutable / reference semantics": + # let a = @[123'u8, 12, 55] + # let aRepl = @[123'u8, 12, 33] + # let b = @[1.123'f32, 4.234, 1e12] + # var df = seqsToDf({ "a" : a }) + # check df["a"].kind == colInt + # check df["a"].toTensor(int) == a.toTensor.asType(int) + # df["a"][df.high] = 33 + # check df["a"].kind == colInt + # check df["a"].toTensor(int) == aRepl.toTensor.asType(int) + # df["a"] = b + # check df["a"].kind == colFloat + # check df["a"].toTensor(float) == b.toTensor.asType(float) + # + # # check reference semantics + # let bMod = @[1.123'f32, 4.234, 1e4] + # var colB = df["a"] + # # modifying `colB` modifies `df["a"]` + # colB[df.high] = 1e4 + # check df["a"].toTensor(float) == bMod.toTensor.asType(float) + # + # # modifying underlying tensor modifies data too + # let bMod2 = @[1.123'f32, 4.234, 1e6] + # var tensorB = df["a"].toTensor(float) + # tensorB[df.high] = 1e6 + # check df["a"].toTensor(float) == bMod2.toTensor.asType(float) + + test "toDf with a `bool` column": + # broke ggplotnim CI due to `rPointInPolygon` noticed in https://github.com/Vindaar/ggplotnim/pull/151 + # It was a regression due to the non-generic generics changes + let a = [1, 2, 3] + let b = [true, false, true] + let c = @[false, true, false] + let df = toDf(a, b, c) + check df["a", int].toSeq1D == a + check df["b", bool].toSeq1D == @b + check df["c", bool].toSeq1D == @c + + test "Extending a DF by a column": + let a = [1, 2, 3] + let b = [3, 4, 5] + let c = [4, 5, 6] + let d = [8, 9, 10] + block: + ## NOTE: This "manual" way of adding a column to an existing data frame + ## is sort of "low level" at the moment. What this means is that the + ## size of the given sequence is ``not`` checked at the moment. So take + ## care that you actually hand a sequence of the same length as the DF! + # create DF of the first 3 seqs + var df = toDf({ "one" : a, + "two" : b, + "three" : c }) + check "one" in df + check "two" in df + check "three" in df + check "four" notin df + # and now add fourth manually + df["four"] = d + check "four" in df + + block: + ## This version checks the length and fails if they don't match + # create DF of the first 3 seqs + var df = toDf({ "one" : a, + "two" : b, + "three" : c }) + check "one" in df + check "two" in df + check "three" in df + check "four" notin df + # and now add fourth manually + df["four"] = d + check "four" in df + block: + # check fails if length is longer + let e = [1, 2, 3, 4, 5] + # create DF of the first 3 seqs + var df = toDf({ "one" : a, + "two" : b, + "three" : c }) + check "one" in df + check "two" in df + check "three" in df + check "five" notin df + # and now add fourth manually + #expect(ValueError): + # df["five"] = e + block: + # check fails if length is shorter + let e = [1, 2] + # create DF of the first 3 seqs + var df = toDf({ "one" : a, + "two" : b, + "three" : c }) + check "one" in df + check "two" in df + check "three" in df + check "five" notin df + # and now add last manually + #expect(ValueError): + # df["five"] = e + + block: + # check if we can override existing column + let e = [11, 22, 33] + # create DF of the first 3 seqs + var df = toDf({ "one" : a, + "two" : b, + "three" : c, + "four" : c}) # assign four as `c` + check "one" in df + check "two" in df + check "three" in df + check "four" in df + # check `"four"` is `c` + check df["four"].toTensor(int) == c.toTensor + # assign actual `"four"` + df["four"] = e + # check `"four"` is now `d` + check df["four"].toTensor(int) == e.toTensor + + + test "Testing `bind_rows`": + let a = [1, 2, 3] + let b = [3, 4, 5] + + let c = [4, 5, 6, 7] + let d = [8, 9, 10, 11] + block: + # bind_rows with automatic `ids`, both having same columns + let df = toDf({"a" : a, "b" : b}) + let df2 = toDf({"a" : c, "b" : d}) + let res = bind_rows([df, df2]) + check res["a"].toTensor(int) == concat(a.toTensor(), c.toTensor(), axis = 0) + check res["b"].toTensor(int) == concat(b.toTensor(), d.toTensor(), axis = 0) + # without specifying `id`, no column will be added + #check toSeq(res["id"]) == %~ concat(toSeq(0..= 50 and + idx("x") <= 75}) + check dfFilter["x"].toTensor(int) == toTensor toSeq(50 .. 75) + + test "Filter - comparisons using function": + let x = toSeq(0 .. 100) + let df = toDf(x) + let dfFilter = df.filter(f{float: idx("x") >= max(col("x")) * 0.5}) + check dfFilter["x"].toTensor(int) == toTensor toSeq(50 .. 100) + + test "Filter - data types": + let x = toSeq(0 .. 100) + let df = toDf(x) + let dfFiltered = df.filter(f{float: idx("x") >= max(col("x")) * 0.5}) + check dfFiltered["x"].kind == colInt + let dfReduced1 = df.summarize(f{int: max(col("x"))}) + echo dfReduced1 + check dfReduced1["(max (col x))"].kind == colInt + let dfReduced2 = df.summarize(f{float: max(col("x"))}) + check dfReduced2["(max (col x))"].kind == colFloat + + test "Transmute - float arithmetic": + let x = toSeq(0 ..< 100) + let y = x.mapIt(sin(it.float)) + let y2 = x.mapIt(pow(sin(it.float), 2.0)) + let df = toDf(x, y) + check df.len == 100 + let dfTrans = df.transmute(f{"x"}, f{"y2" ~ idx("y") * idx("y")}) + check "y" notin dfTrans + check "y2" in dfTrans + check "x" in dfTrans + check dfTrans["y2"].toTensor(float) == toTensor y2 + + test "Transmute - parse floats in dataframe from string column": + let x = toSeq(0 ..< 100) + let y = x.mapIt($sin(it.float)) + let yFloat = x.mapIt(sin(it.float)) + let df = toDf(x, y) + check df.len == 100 + let dfTrans = df.transmute(f{"x"}, + f{string -> float: "yFloat" ~ parseFloat(df["y"][idx])}) + check "y" notin dfTrans + check "yFloat" in dfTrans + check "x" in dfTrans + let trans = dfTrans["yFloat"].toTensor(float) + let exp = toTensor yFloat + for i in 0 ..< trans.len: + check almostEqual(trans[i], exp[i]) + + test "Gather - 2 columns": + let x = toSeq(0 ..< 100) + let y1 = x.mapIt(sin(it.float)) + let y2 = x.mapIt(sin(it.float - PI / 2.0) - 0.5) + let yComb = concat(y1, y2) + let df = toDf(x, y1, y2) + check df.len == 100 + let dfLong = df.gather(["y1", "y2"], key = "from", value = "y") + check dfLong.len == 200 + check dfLong["from"].unique.toTensor(string) == toTensor @["y1", "y2"] + check dfLong["y"].toTensor(float) == toTensor(yComb) + let dfY1FromLong = dfLong.filter(f{idx("from") == "y1"}) + let dfY2FromLong = dfLong.filter(f{idx("from") == "y2"}) + check dfY1FromLong["y"].toTensor(float) == df["y1"].toTensor(float) + check dfY2FromLong["y"].toTensor(float) == df["y2"].toTensor(float) + check dfY1FromLong["x"].toTensor(float) == df["x"].toTensor(float) + check dfY2FromLong["x"].toTensor(float) == df["x"].toTensor(float) + + test "Gather - 3 columns": + ## check that it works for 3 columns too + let x = toSeq(0 ..< 100) + let y1 = x.mapIt(sin(it.float)) + let y2 = x.mapIt(sin(it.float - PI / 2.0) - 0.5) + let y3 = x.mapIt(cos(it.float - PI / 2.0) - 0.5) + let yComb = concat(y1, y2, y3) + let df = toDf(x, y1, y2, y3) + check df.len == 100 + let dfLong = df.gather(["y1", "y2", "y3"], key = "from", value = "y") + check dfLong.len == 300 + check dfLong["from"].unique.toTensor(string) == toTensor @["y1", "y2", "y3"] + check dfLong["y"].toTensor(float) == toTensor yComb + let dfY1FromLong = dfLong.filter(f{idx("from") == "y1"}) + let dfY2FromLong = dfLong.filter(f{idx("from") == "y2"}) + let dfY3FromLong = dfLong.filter(f{idx("from") == "y3"}) + check dfY1FromLong["y"].toTensor(float) == toTensor(df["y1"], float) + check dfY2FromLong["y"].toTensor(float) == toTensor(df["y2"], float) + check dfY3FromLong["y"].toTensor(float) == toTensor(df["y3"], float) + check dfY1FromLong["x"].toTensor(float) == toTensor(df["x"], float) + check dfY2FromLong["x"].toTensor(float) == toTensor(df["x"], float) + check dfY3FromLong["x"].toTensor(float) == toTensor(df["x"], float) + + + test "Gather - string and float column": + ## while it may be questionable to combine string and float columns in general + ## it should still work + let x = toSeq(0 ..< 100) + let y1 = x.mapIt(sin(it.float)) + let yStr = x.mapIt($it) + let yComb = concat(%~ y1, %~ yStr) + let df = toDf(x, y1, yStr) + check df.len == 100 + let dfLong = df.gather(["y1", "yStr"], key = "from", value = "y") + check dfLong.len == 200 + check dfLong["from"].unique.toTensor(string) == toTensor @["y1", "yStr"] + check dfLong["y"].toTensor(Value) == toTensor yComb + let dfY1FromLong = dfLong.filter(f{idx("from") == "y1"}) + let dfYSTRFromLong = dfLong.filter(f{idx("from") == "yStr"}) + check dfY1FromLong["y"].toTensor(float) == df["y1"].toTensor(float) + check dfYSTRFromLong["y"].toTensor(string) == df["yStr"].toTensor(string) + check dfY1FromLong["x"].toTensor(float) == df["x"].toTensor(float) + check dfYSTRFromLong["x"].toTensor(float) == df["x"].toTensor(float) + + test "Gather - dropping null values": + ## check that it works for 3 columns too + let x = toSeq(0 ..< 100) + var + y1: seq[float] + y2: seq[Value] + x2s: seq[int] + for i, val in x: + y1.add sin(val.float) + if val mod 3 == 0: + y2.add (%~ (sin(val.float - PI / 2.0) - 0.5)) + x2s.add i + else: + y2.add Value(kind: VNull) + let df = toDf(x, y1, y2) + let gathered = df.gather(["y1", "y2"], dropNulls = false) + let onlyy2 = gathered.filter(f{Value: isNull(df["value"][idx]).toBool == false and + idx("key") == %~ "y2"}) + check onlyy2["x"].toTensor(int) == toTensor x2s + check onlyy2.len == x2s.len + + test "Spread": + #block: + # let df = readCsv("data/fishdata_sparse.csv") + # let dfSpread = df.spread(namesFrom = "station", valuesFrom = "seen") + # let namesExp = concat(df["station"].unique.toTensor(string).toSeq1D, + # @["fish"]).sorted + # check dfSpread.len == 19 + # check dfSpread.getKeys().len == 12 + # check dfSpread.getKeys().sorted == namesExp + # for k in dfSpread.getKeys(): + # check dfSpread[k].kind == colInt + # # easy column to check, all 1 + # check dfSpread["Release", int] == newTensorWith(19, 1) + # ## TODO: support NULL values instead of filling by default T(0) + block: + let data = """ + Type Septem Line Fake ε_cut FractionPass + LineReal false true Real 1 0.2204 + LineFake false true Fake 1 0.8622 + SeptemReal true false Real 1 0.2315 +SeptemLineReal true true Real 1 0.1368 +SeptemLineFake true true Fake 1 0.7255 + SeptemFake true false Fake 1 0.7763 +""" + let df = parseCsvString(data, sep = ' ') + let exp = """ + Type Septem Line ε_cut Real Fake + LineFake false true 1 0 0.8622 + LineReal false true 1 0.2204 0 + SeptemFake true false 1 0 0.7763 +SeptemLineFake true true 1 0 0.7255 +SeptemLineReal true true 1 0.1368 0 + SeptemReal true false 1 0.2315 0 +""" + let dfExp = parseCsvString(exp, sep = ' ') + let dfRes = df.spread("Fake", "FractionPass") + check dfRes.len == 6 + check dfRes.getKeys().len == 6 + check dfRes.getKeys() == dfExp.getKeys() + check equal(dfRes, dfExp) + + test "Pretty printing of DFs": + var + # need the data as two sequences (well actually as a DataTable, but that is + # created most easily from two or more sequences). + x: seq[float] + y: seq[float] + for i in 0 ..< 1000: + let pos = 2 * 3.1415 / 100.0 * i.float + x.add pos + y.add sin(pos) + let df = toDf(x, y) + let defaultExp = """ + Idx x y + dtype: float float + 0 0 0 + 1 0.06283 0.06279 + 2 0.1257 0.1253 + 3 0.1885 0.1874 + 4 0.2513 0.2487 + 5 0.3141 0.309 + 6 0.377 0.3681 + 7 0.4398 0.4258 + 8 0.5026 0.4817 + 9 0.5655 0.5358 + 10 0.6283 0.5878 + 11 0.6911 0.6374 + 12 0.754 0.6845 + 13 0.8168 0.729 + 14 0.8796 0.7705 + 15 0.9425 0.809 + 16 1.005 0.8443 + 17 1.068 0.8763 + 18 1.131 0.9048 + 19 1.194 0.9298 +""" + let dfStr = pretty(df, header = false) + #check dfStr == defaultExp + let expPrecision12 = """ + Idx x y + dtype: float float + 0 0 0 + 1 0.06283 0.062788670114 + 2 0.12566 0.125329556644 + 3 0.18849 0.187375853836 + 4 0.25132 0.248682707741 + 5 0.31415 0.309008182482 + 6 0.37698 0.368114215006 + 7 0.43981 0.425767554563 + 8 0.50264 0.481740683175 + 9 0.56547 0.535812713502 + 10 0.6283 0.587770260526 + 11 0.69113 0.637408283636 + 12 0.75396 0.684530895785 + 13 0.81679 0.728952136516 + 14 0.87962 0.770496705823 + 15 0.94245 0.809000655938 + 16 1.00528 0.844312038323 + 17 1.06811 0.876291503299 + 18 1.13094 0.90481284997 + 19 1.19377 0.929763524249 +""" + let dfPrecision12 = pretty(df, precision = 12, header = false) + #check expPrecision12 == dfPrecision12 + + test "CSV parsing with spaces": + let csvDataStream = newStringStream(""" +t_in_s, C1_in_V, C2_in_V, type +-3.0000E-06, -2.441E-04, -6.836E-04, T1 +-2.9992E-06, 2.441E-04, -6.836E-04 , T1 +-2.9984E-06, 1.025E-03, -8.789E-04 , T1 +-2.9976E-06, 1.025E-03, -2.930E-04 , T1 +-2.9968E-06, 9.277E-04, 2.930E-04 , T2 +-2.9960E-06, 4.395E-04, 4.883E-04 , T2 +-2.9952E-06, 1.465E-04, -2.930E-04 , T2 +-2.9944E-06, -3.418E-04, -1.270E-03, T2 +""") + let csvRead = readCsv(csvDataStream) + let texp = @[-3.0000E-06, -2.9992E-06, -2.9984E-06, -2.9976E-06, -2.9968E-06, + -2.9960E-06, -2.9952E-06, -2.9944E-06] + let c1Exp = @[-2.441E-04, 2.441E-04, 1.025E-03, 1.025E-03, 9.277E-04, 4.395E-04, + 1.465E-04, -3.418E-04] + let c2Exp = @[-6.836E-04, -6.836E-04, -8.789E-04, -2.930E-04, 2.930E-04, + 4.883E-04, -2.930E-04, -1.270E-03] + let typeExp = @["T1", "T1", "T1", "T1", "T2", + "T2", "T2", "T2"] + let dfExp = toDf({ "t_in_s" : texp, "C1_in_V" : c1Exp, "C2_in_V" : c2Exp, + "type" : typeExp}) + let df = toDf(csvRead) + check df["t_in_s"].toTensor(float) == dfExp["t_in_s"].toTensor(float) + check df["C1_in_V"].toTensor(float) == dfExp["C1_in_V"].toTensor(float) + check df["C2_in_V"].toTensor(float) == dfExp["C2_in_V"].toTensor(float) + check df["type"].toTensor(string) == dfExp["type"].toTensor(string) + + #test "CSV parsing of data with unnamed column": + # let df = readCsv("data/03-sample_hugo.csv") + # check df.ncols == 9 + # check "Unnamed0" in df + # check df["Unnamed0", int] == arange(0, 200).toTensor + + test "Summarize": + #let mpg = readCsv("data/mpg.csv") + #block: + # # explicit LHS + # let res = mpg.summarize(f{int: "num" << sum(col("cyl"))}) + # check "num" in res + # check res.len == 1 + # check res["num", 0] == %~ 1378 + # # implicit LHS + # let resImplicit = mpg.summarize(f{int: sum(col("cyl"))}) + # let fname = "(sum cyl)" + # check fname in resImplicit + # check resImplicit.len == 1 + # check resImplicit[fname, 0] == %~ 1378 + #block: + # # explicit LHS + # let res = mpg.summarize(f{float: "mean" << mean(col("cyl"))}) + # check "mean" in res + # check res.len == 1 + # check almostEqual(res["mean", 0].toFloat, 5.888888889) + # # implicit LHS + # let resImplicit = mpg.summarize(f{float: mean(col("cyl"))}) + # let fname = "(mean cyl)" + # check fname in resImplicit + # check resImplicit.len == 1 + # check almostEqual(resImplicit[fname, 0].toFloat, 5.888888889) + #block: + # # summarize multiple groups at the same time + # let res = mpg.group_by(["class", "cyl"]).summarize(f{float: mean(col("hwy"))}) + # check res.len == 19 + # # expected numbers. They seem reasonable, but ``I did NOT`` check them + # # manually!! + # # hence another test below with known numbers and their sum + # let exp = @[24.8, 24.8, 29.47, 29.47, 29, 29, 25.31, 25.31, 29.19, 29.19, 26.26, 26.26, 24, 24, 24, 24, 22.2, 22.2, 20.67, 20.67, 17.9, 17.9, 15.8, 15.8, 30.81, 30.81, 28.5, 28.5, 24.71, 24.71, 21.6, 21.6, 23.75, 23.75, 18.5, 18.5, 16.79, 16.79] + # let resSet = res[$f{float: mean(col("hwy"))}].toTensor(float).map(x => x.round(2)).toHashSet + # let expSet = exp.toHashSet + # check resSet == expSet + block: + # generate numbers + let num = toSeq(1 .. 100) + let numVec = repeat(num, 26).flatten + let sumNum = num.sum + let lab1 = toSeq({'a'..'z'}).mapIt($it) + let lab2 = toSeq({'A'..'Z'}).mapIt($it) + var l1 = newSeq[string]() + var l2 = newSeq[string]() + var count = 0 + for j in 0 ..< lab1.len: + for i in 0 ..< num.len: + l1.add lab1[j] + l2.add lab2[j] + inc count + check count == 2600 + let df = toDf(l1, l2, numVec) + let dfG = df.group_by(["l1", "l2"]).summarize(f{int: sum(col("numVec"))}) + check dfG.len == 26 + check sumNum == 5050 + for el in dfG[$f{int: sum(col("numVec"))}].toTensor(Value): + check el == %~ sumNum + + block: + let df = seqsToDf({"x": @[1, 2, 3, 4, 5], "y": @[5, 10, 15, 20, 25]}) + try: + # fails with `FormulaMismatchError` as there is no reducing proc call in + # the formula body! + echo df.summarize(f{float: `x`}) + except FormulaMismatchError: + discard + + #test "Count": + # # count elements by group. Useful combination of group_by and summarize(len) + # let mpg = readCsv("data/mpg.csv") + # # in manual case the order is not preserved, due to `summarize` impl! + # let exp = toHashSet({6 : 79, 8 : 70, 4 : 81, 5 : 4}) + # block: + # # manually + # let res = mpg.group_by("cyl").summarize(f{int: "num" << col("cyl").len}) + # check "num" in res + # check res.len == 4 + # var resSet = initHashSet[(int, int)]() + # for row in res: + # resSet.incl (row["cyl"].toInt.int, row["num"].toInt.int) + # check resSet == exp + # # using `count` directly + # let resDirect = mpg.count("cyl") + # check "n" in resDirect + # check resDirect.len == 4 + # var resDirectSet = initHashSet[(int, int)]() + # for row in resDirect: + # resDirectSet.incl (row["cyl"].toInt.int, row["n"].toInt.int) + # check resDirectSet == exp + + test "Count - multiple columns": + let df = toDf({ "A" : concat(newSeqWith(30, 1), newSeqWith(30, 2), newSeqWith(40, 3)), + "B" : concat(newSeqWith(20, 5), newSeqWith(50, 6), newSeqWith(30, 7)), + "C" : toSeq(0 ..< 100) }) + let exp = toDf({ "A": [1, 1, 2, 3, 3], "B" : [5, 6, 6, 6, 7], + "n" : [20, 10, 30, 10, 30] }) + + ## Manual using `summarize` + check equal(df.group_by(by=["A", "B"]).summarize(f{int: "n" << len(col("C")) }), exp) + ## First a single `group_by`, then `count` + check equal(df.group_by("A").count("B"), exp) + ## Using multiple columns in `count` + check equal(df.count(["A", "B"]), exp) + + test "isNull": + # tests removal of VNull elements in a column with VNull + let x1 = toSeq(0 .. 100) + let x2 = toSeq(0 .. 10) + let df = toDf(x1, x2) + check df.filter(f{Value: isNull(df["x2"][idx]).toBool == false})["x2"].toTensor(Value) == toTensor (%~ x2) + + test "Unique - duplicates using all columns": + # given some data containing duplicates + let dataDuplStream = newStringStream(""" +t_in_s, C1_in_V, C2_in_V, type +-3.0000E-06, -2.441E-04, -6.836E-04, T1 +-2.9992E-06, 2.441E-04, -6.836E-04 , T1 +-2.9984E-06, 1.025E-03, -8.789E-04 , T1 +-2.9976E-06, 1.025E-03, -2.930E-04 , T1 +-2.9992E-06, 2.441E-04, -6.836E-04 , T1 +-2.9984E-06, 1.025E-03, -8.789E-04 , T1 +-2.9976E-06, 1.025E-03, -2.930E-04 , T1 +-2.9968E-06, 9.277E-04, 2.930E-04 , T2 +""") + let df = toDf(readCsv(dataDuplStream)) + check df.len == 8 + let dfUnique = df.unique + check dfUnique.len == 5 + + test "Unique - duplicates using subset of columns": + let s1 = @[1, 2, 3, 4, 5] + let s2 = @["A", "E", "A", "D", "E"] + let s3 = @["B", "G", "B", "G", "X"] + let df = seqsToDF({ "id" : s1, + "Start" : s2, + "Stop" : s3 }) + check df.len == 5 + let dfUniqueAll = df.unique + check dfUniqueAll.len == 5 + # now only use columns start and stop + let dfUnique = df.unique(["Start", "Stop"]) + check dfUnique.len == 4 + + #test "setDiff": + # # remove duplicates of `mpg` (for some reason there are 9 duplicates..) + # let mpg = readCsv("data/mpg.csv").unique + # let mpgS1 = mpg[0 .. 25] + # let mpgS2 = mpg[20 .. 29] + # block: + # # S1 is primary + # let exp = mpg[0 .. 19].arrange(toSeq(keys(mpg))) + # let res = setDiff(mpgS1, mpgS2).arrange(toSeq(keys(mpg))) + # check exp.len == res.len + # for i in 0 ..< exp.len: + # check row(exp, i) == row(res, i) + # block: + # # S2 is primary + # let exp = mpg[26 .. 29].arrange(toSeq(keys(mpg))) + # let res = setDiff(mpgS2, mpgS1).arrange(toSeq(keys(mpg))) + # check exp.len == res.len + # for i in 0 ..< exp.len: + # check row(exp, i) == row(res, i) + # block: + # # symmetric difference + # let exp = bind_rows(mpg[0 .. 19], mpg[26 .. 29], id = "") + # .arrange(toSeq(keys(mpg))) + # let res = setDiff(mpgS1, mpgS2, symmetric = true).arrange(toSeq(keys(mpg))) + # check exp.len == res.len + # for i in 0 ..< exp.len: + # check row(exp, i) == row(res, i) + + test "Custom column names when reading CSV like data": + # given some data without a header and column names + let data = """ +-3.0000E-06, -2.441E-04, -6.836E-04, T1 +-2.9992E-06, 2.441E-04, -6.836E-04 , T1 +-2.9984E-06, 1.025E-03, -8.789E-04 , T1 +""" + let dataDuplStream = newStringStream(data) + # define columns + let cols = @["V1", "V2", "V3", "Channel"] + block OldParser: + let df = toDf(readCsv(dataDuplStream, colNames = cols)) + check df.len == 3 + check df.getKeys.sorted == cols.sorted + block NewParser: + let df = parseCsvString(data, colNames = cols) + check df.len == 3 + check df.getKeys.sorted == cols.sorted + + test "Column names containing numbers": + # given some data without a header and column names + let data = """ +-3.0000E-06, -2.441E-04, -6.836E-04, T1 +-2.9992E-06, 2.441E-04, -6.836E-04 , T1 +-2.9984E-06, 1.025E-03, -8.789E-04 , T1 +""" + let dataDuplStream = newStringStream(data) + # define columns + let cols = @["0", "1", "2", "3"] + let colsNot = @["\"0\"", "\"1\"", "\"2\"", "\"3\""] + block OldParser: + let df = toDf(readCsv(dataDuplStream, colNames = cols)) + check df.len == 3 + check df.getKeys.sorted == cols.sorted + # redundant but a showcase what happened previously + for k in zip(df.getKeys, colsNot): + check k[0] != k[1] + block NewParser: + let df = parseCsvString(data, colNames = cols) + check df.len == 3 + check df.getKeys.sorted == cols.sorted + # redundant but a showcase what happened previously + for k in zip(df.getKeys, colsNot): + check k[0] != k[1] + + test "Custom column names replacing a real header": + let data = """ + ag, Z=47, (Energy (eV),f1,f2) + 10.0000 -9999.00 1.18566 + 10.1617 -9999.00 1.22941 + 10.3261 -9999.00 1.27478 + 10.4931 -9999.00 1.32182 + 10.6628 -9999.00 1.38215 +""" + let cols = @["Energy", "f1", "f2"] + # note the `skipLines`! Have to skip the real header line! + let df = parseCsvString(data, colNames = cols, sep = ' ', skipLines = 1) + check df.len == 5 + check df.getKeys.sorted == cols.sorted + check df["f1", float].toSeq1D == @[-9999.0, -9999.0, -9999.0, -9999.0, -9999.0] + + test "Parsing space seperated data with spacing at the end of lines": + let data = """ + Energy f1 f2 + 10.0000 -9999.00 1.18566 + 10.1617 -9999.00 1.22941 + 10.3261 -9999.00 1.27478 + 10.4931 -9999.00 1.32182 + 10.6628 -9999.00 1.38215 +""" + let df = parseCsvString(data, sep = ' ') + check df.len == 5 + check df.getKeys.sorted == @["Energy", "f1", "f2"] + check df["f1", float].toSeq1D == @[-9999.0, -9999.0, -9999.0, -9999.0, -9999.0] + + #test "Evaluate data frame using FormulaNode": + # let mpg = readCsv("data/mpg.csv") + # let f = f{`hwy` ~ (`displ` + `cyl` - `cty`)} # this doesn't make sense, but anyways... + # # Displacement + Cylinders - City mpg. Yeah :D + # # use RHS of formula for calculation of 0 row. + # # not exactly possible on arraymancer backend + # check f.evaluate(mpg)[0, Value] == %~ -12.2 + # + # # applying negative column results in expected + # # stringifaction of the formula + # let dfNeg = mpg.clone.transmute(f{-1.0 * idx("hwy")}) + # check "(* -1.0 hwy)" == getKeys(dfNeg)[0] + # + # # negative prefix of existing column results in what we expect + # check evaluate(f{-1.0 * idx("hwy")}, mpg).toTensor(float) == mpg["hwy"].toTensor(float).map(x => -x) + # # evaluate non existant key to vector of constant + # check evaluate(f{"nonExistant"}, mpg).toTensor(string) == toTensor toSeq(0 ..< mpg.len).mapIt("nonExistant") + # # evaluate formula without column on DF + # check evaluate(f{1 + 2}, mpg).toTensor(int) == toTensor toSeq(0 ..< mpg.len).mapIt(3) + + #test "Reduce data frame using FormulaNode": + # let mpg = readCsv("data/mpg.csv") + # # check reduction via a formula and VectorFloatProc + # check almostEqual(reduce(f{float: mean(col("hwy"))}, mpg).toFloat, 23.44017, 1e-3) + # + # # combine with calculation + # check almostEqual(reduce(f{float: 235 / mean(col("hwy"))}, mpg).toFloat, 10.0255, 1e-3) + + test "Allow `add` if first argument is still uninitialized": + # uninitialized data frame (DataTable is ref object) + var df: DataTable[Column] + check df.isNil + let dfToAdd = toDf({ "x" : @[1, 2, 3], + "y" : @[4, 5, 6] }) + df.add dfToAdd + check df == dfToAdd + check dfToAdd["x"].toTensor(int) == [1, 2, 3].toTensor + check dfToAdd["y"].toTensor(int) == [4, 5, 6].toTensor + + test "Inner join - fully qualified": + let idents = @["A", "B", "C", "D"] + let ids = @[1, 2, 3, 4] + let words = @["suggest", "result", "from", "to"] + let df1 = toDf({ "Ident" : idents, + "Ids" : ids}) + let df2 = toDf({ "Ident" : idents, + "Words" : words }) + let dfExp = toDf({ "Ident" : idents, + "Words" : words, + "Ids" : ids }) + let dfRes = df1.innerJoin(df2, by = "Ident") + check dfRes.len == dfExp.len + check dfRes.getKeys == dfExp.getKeys + check dfRes["Ident"].toTensor(string) == dfExp["Ident"].toTensor(string) + check dfRes["Ids"].toTensor(int) == dfExp["Ids"].toTensor(int) + check dfRes["Words"].toTensor(string) == dfExp["Words"].toTensor(string) + + test "Inner join - int & float column": + let idents = @["A", "B", "C", "D"] + let ids = @[1, 2, 3, 4] + let idsFloat = @[1'f64, 2, 3, 4] + let words = @["suggest", "result", "from", "to"] + let df1 = toDf({ "Ident" : idents, + "Ids" : ids}) + let df2 = toDf({ "Ident" : idents, + "Ids" : idsFloat, + "Words" : words}) + let dfExp = toDf({ "Ident" : idents, + "Words" : words, + "Ids" : idsFloat }) + let dfRes = df1.innerJoin(df2, by = "Ident") + check dfRes.len == dfExp.len + check dfRes.getKeys == dfExp.getKeys + check dfRes["Ident"].toTensor(string) == dfExp["Ident"].toTensor(string) + # result has enveloping column kind float + check dfRes["Ids"].kind == colFloat + check dfRes["Ids"].toTensor(float) == dfExp["Ids"].toTensor(float) + check dfRes["Words"].toTensor(string) == dfExp["Words"].toTensor(string) + + test "Inner join - missing elements": + let idents = @["A", "B", "C", "D", "E"] + let ids = @[1, 2, 3, 4, 5] + let idsFloat = @[1'f64, 2, 3, 4] + let words = @["suggest", "result", "from", "to"] + let df1 = toDf({ "Ident" : idents, + "Ids" : ids}) + let df2 = toDf({ "Ident" : idents[0 ..< ^1], + "Ids" : idsFloat, + "Words" : words}) + let dfExp = toDf({ "Ident" : idents[0 ..< ^1], + "Words" : words, + "Ids" : idsFloat }) + let dfRes = df1.innerJoin(df2, by = "Ident") + check dfRes.len == dfExp.len + check dfRes.getKeys == dfExp.getKeys + check dfRes["Ident"].toTensor(string) == dfExp["Ident"].toTensor(string) + # result has enveloping column kind float + check dfRes["Ids"].kind == colFloat + check dfRes["Ids"].toTensor(float) == dfExp["Ids"].toTensor(float) + check dfRes["Words"].toTensor(string) == dfExp["Words"].toTensor(string) + + test "Convert (one typed) object column to native": + let a = @["A", "B", "C", "D", "E"] + let b = @[1, 2, 3, 4, 5] + let c = @[1.1, 1.2, 1.3, 1.5] + let d = @[true, true, false, true] + let aCol = toColumn(%~ a) + let bCol = toColumn(%~ b) + let cCol = toColumn(%~ c) + let dCol = toColumn(%~ d) + check aCol.kind == colObject + check bCol.kind == colObject + check cCol.kind == colObject + check dCol.kind == colObject + check aCol.toNativeColumn.kind == colString + check bCol.toNativeColumn.kind == colInt + check cCol.toNativeColumn.kind == colFloat + check dCol.toNativeColumn.kind == colBool + + test "Convert multi typed (real object) column to native fails": + let a = @["A", "B", "C", "D", "E"] + let b = @[1, 2, 3, 4, 5] + let ab = concat(%~ a, %~ b) + let c = @[1.1, 1.2, 1.3, 1.5] + let d = @[true, true, false, true] + let cd = concat(%~ c, %~ d) + let abCol = toColumn(%~ ab) + let cdCol = toColumn(%~ cd) + check abCol.kind == colObject + check cdCol.kind == colObject + try: + # This actually works because ints can be converted to string! + # that's not really desired behavior, is it? + check abCol.toNativeColumn.kind == colString + check false + except ValueError: + check true + try: + check cdCol.toNativeColumn.kind == colFloat + check false + except ValueError: + check true + + test "Remove 'null' values from DF": + let idents = @["A", "B", "C", "D", "E"] + let ids = @[1, 2, 3, 4, 5] + let ages = @[43, 27, 32, 43] + let cities = @[%~ "NYC", %~ "London", %~ "Sydney", Value(kind: VNull), + %~ "Berlin"] + let df = toDf({ "Ident" : idents, + "Id" : ids, + "Age" : ages, + "City" : cities}) + # now check for: + # -> filter by each individual column + # -> filter by both columns in one call + let dfExp1 = toDf({ "Ident" : idents[0 ..< ^1], + "Id" : ids[0 ..< ^1], + "Age" : ages, + "City" : cities[0 ..< ^1] }) + let dfExp2 = toDf({ "Ident" : @["A", "B", "C", "E"], + "Id" : @[1, 2, 3, 5], + "Age" : @[%~ 43, %~ 27, %~ 32, Value(kind: VNull)], + "City" : cities.filterIt(it.kind != VNull) }) + let dfExp3 = toDf({ "Ident" : @["A", "B", "C"], + "Id" : @[1, 2, 3], + "Age" : %~ @[43, 27, 32], + "City" : %~ cities[0 ..< ^2]}) + block noNativeConversion: + let dfRes1 = df.drop_null("Age") + check dfRes1["Age"].kind == colObject + check dfRes1["City"].kind == colObject + let dfRes2 = df.drop_null("City") + check dfRes2["Age"].kind == colObject + check dfRes2["City"].kind == colObject + let dfRes3 = df.drop_null() + check dfRes3["Age"].kind == colObject + check dfRes3["City"].kind == colObject + let keys = getKeys(df) + for k in keys: + check dfRes1[k].toTensor(Value) == dfExp1[k].toTensor(Value) + check dfRes2[k].toTensor(Value) == dfExp2[k].toTensor(Value) + check dfRes3[k].toTensor(Value) == dfExp3[k].toTensor(Value) + + # convert manually to correct dtypes + check dfRes1["Age"].toNativeColumn.kind == colInt + #expect(ValueError): + # check dfRes1["City"].toNativeColumn.kind == colString + check dfRes1["City"].toNativeColumn(failIfImpossible = false).kind == colObject + + check dfRes2["City"].toNativeColumn.kind == colString + #expect(ValueError): + # check dfRes2["Age"].toNativeColumn.kind == colInt + check dfRes2["Age"].toNativeColumn(failIfImpossible = false).kind == colObject + + check dfRes3["Age"].toNativeColumn.kind == colInt + check dfRes3["City"].toNativeColumn.kind == colString + + block nativeConversion: + let dfRes1 = df.drop_null("Age", convertColumnKind = true) + let dfRes2 = df.drop_null("City", convertColumnKind = true) + let dfRes3 = df.drop_null(convertColumnKind = true) + + # convert manually to correct dtypes + check dfRes1["Age"].kind == colInt + check dfRes1["City"].kind == colObject + + check dfRes2["City"].kind == colString + check dfRes2["Age"].kind == colObject + + check dfRes3["Age"].kind == colInt + check dfRes3["City"].kind == colString + + test "Inplace filter & assign": + let c1 = constantColumn(10, 40) + let c2 = toColumn toSeq(0 ..< 40) + var df = toDf(c1, c2) + df[f{`c2` > 10 and `c2` < 20}, "c2"] = 42 + df[f{`c2` > 20 and `c2` < 30}, "c1"] = 46 + check df.filter(f{`c2` == 42}).len == 9 + check df.filter(f{`c1` == 46}).len == 9 + let data1 = df["c1", int] + let data2 = df["c2", int] + check data1[21 .. 29] == toSeq(0 .. 8).mapIt(46).toTensor() + check data2[11 .. 19] == toSeq(0 .. 8).mapIt(42).toTensor() + + test "Add row to DF (WARNING: very slow!)": + let c1 = constantColumn(10, 10) + let c2 = toColumn toSeq(0 ..< 10).mapIt(it.float) + var df = toDf(c1, c2) + for i in 0 ..< 10: + df.add(i, i.float * 2) + check df.len == 20 + let t1 = df["c1", int] + let t2 = df["c2", float] + check t1 == toTensor toSeq(0 ..< 20).mapIt(if it < 10: 10 else: it - 10) + check t2 == toTensor toSeq(0 ..< 20).mapIt(if it < 10: it.float else: (it.float - 10.0) * 2) + + #test "Mutate/Transmute works on grouped dataframes": + # block Mutate: + # let df = readCsv("data/mpg.csv") + # .group_by("class") + # # for simplicity, we're gonna add the mean of each group to + # .mutate(f{float -> float: "subMeanHwy" ~ 0.0 + mean(df["hwy"])}) + # .arrange("class") + # let expDf = df.group_by("class").summarize(f{float -> float: "subMeanHwy" << mean(col("hwy"))}) + # .arrange("class") + # + # check df.select("subMeanHwy").unique()["subMeanHwy", float] == expDf.select("subMeanHwy")["subMeanHwy", float] + # + # block Transmute: + # let df = readCsv("data/mpg.csv") + # var dfTr = df + # .group_by("class") + # # for simplicity, we're gonna add the mean of each group to + # .transmute(f{float -> float: "subMeanHwy" ~ 0.0 + mean(df["hwy"])}, + # f{"class"}) + # .arrange("class") + # let expDf = df.group_by("class").summarize(f{float -> float: "subMeanHwy" << mean(col("hwy"))}) + # .arrange("class") + # + # check dfTr.select("subMeanHwy").unique()["subMeanHwy", float] == expDf.select("subMeanHwy")["subMeanHwy", float] + + test "Construction with scalar": + var df = toDf({ "x" : @[1,2,3], + "y" : toSeq(5..7), + "z" : "foo", + "α" : 2.5 }) + check df.len == 3 + check df["x", int] == [1,2,3].toTensor + check df["y", int] == [5,6,7].toTensor + echo df + check df["z"].kind == colConstant + check df["α"].kind == colConstant + check df["z"].cCol == %~ "foo" + check df["α"].cCol == %~ 2.5 + df["β"] = 123 + check df["β"].kind == colConstant + check df["β"].cCol == %~ 123 + + test "Index access for DataFrames using `[[i]]` operator": + let df = toDf({"a" : [1, 2], "b" : [3, 4], "c" : [5, 6], "d" : [7, 8]}) + block: + check df[[0]].toTensor(int) == [1, 2].toTensor + check df[[1]].toTensor(int) == [3, 4].toTensor + check df[[2]].toTensor(int) == [5, 6].toTensor + check df[[3]].toTensor(int) == [7, 8].toTensor + block: + try: + discard df[[-1]] + except ValueError: + discard + try: + discard df[[5]] + except ValueError: + discard + + let df2 = toDf({"b" : [3, 4], "a" : [1, 2], "d" : [7, 8], "c" : [5, 6]}) + block: + check df2[[1]].toTensor(int) == [1, 2].toTensor + check df2[[0]].toTensor(int) == [3, 4].toTensor + check df2[[3]].toTensor(int) == [5, 6].toTensor + check df2[[2]].toTensor(int) == [7, 8].toTensor + + test "Select - selecting a column": + let df = toDf({"a" : [1, 2], "b" : [3, 4], "c" : [5, 6], "d" : [7, 8]}) + block: + let res = df.select("a") + check "a" in res + check "b" notin res and "c" notin res and "d" notin res + block: + let res = df.select("b") + check "b" in res + check "a" notin res and "c" notin res and "d" notin res + block: + let res = df.select("d") + check "d" in res + check "a" notin res and "b" notin res and "c" notin res + + test "Select - selecting multiple columns": + let df = toDf({"a" : [1, 2], "b" : [3, 4], "c" : [5, 6], "d" : [7, 8]}) + block: + let res = df.select("a", "b") + check "a" in res + check "b" in res + check "c" notin res and "d" notin res + block: + let res = df.select("b", "d") + check "b" in res + check "d" in res + check "a" notin res and "c" notin res + block: # using array + let res = df.select(["a", "d"]) + check "a" in res + check "d" in res + check "b" notin res and "c" notin res + block: # using seq + let res = df.select(@["a", "d"]) + check "a" in res + check "d" in res + check "b" notin res and "c" notin res + + test "Select - using a Formula": + let df = toDf({"a" : [1, 2], "b" : [3, 4], "c" : [5, 6], "d" : [7, 8]}) + block: + let res = df.select(f{"a"}, f{"B" <- "b"}) # formula & string cannot be mixed + check "a" in res + check "B" in res + check "c" notin res and "d" notin res and "b" notin res + + test "Select - respects order of given keys": + let df = toDf({"a" : [1, 2], "b" : [3, 4], "c" : [5, 6], "d" : [7, 8]}) + block: + let res = df.select("a", "b") + check res[[0]].toTensor(int) == [1, 2].toTensor + check res[[1]].toTensor(int) == [3, 4].toTensor + block: + let res = df.select("b", "a") + check res[[1]].toTensor(int) == [1, 2].toTensor + check res[[0]].toTensor(int) == [3, 4].toTensor + + test "Relocate - relocate a single column": + let df = toDf({"a" : [1, 2], "b" : [3, 4], "c" : [5, 6], "d" : [7, 8]}) + check df[[0]].toTensor(int) == [1, 2].toTensor + check df[[1]].toTensor(int) == [3, 4].toTensor + check df[[2]].toTensor(int) == [5, 6].toTensor + check df[[3]].toTensor(int) == [7, 8].toTensor + block: + let res = df.relocate("a", after = "c") + check res[[2]].toTensor(int) == [1, 2].toTensor + check res[[0]].toTensor(int) == [3, 4].toTensor + check res[[1]].toTensor(int) == [5, 6].toTensor + check res[[3]].toTensor(int) == [7, 8].toTensor + block: + let res = df.relocate("a", after = "d") + check res[[3]].toTensor(int) == [1, 2].toTensor + check res[[0]].toTensor(int) == [3, 4].toTensor + check res[[1]].toTensor(int) == [5, 6].toTensor + check res[[2]].toTensor(int) == [7, 8].toTensor + block: + let res = df.relocate("a", before = "b") + check res[[0]].toTensor(int) == [1, 2].toTensor + check res[[1]].toTensor(int) == [3, 4].toTensor + check res[[2]].toTensor(int) == [5, 6].toTensor + check res[[3]].toTensor(int) == [7, 8].toTensor + block: + let res = df.relocate("c", before = "a") + check res[[1]].toTensor(int) == [1, 2].toTensor + check res[[2]].toTensor(int) == [3, 4].toTensor + check res[[0]].toTensor(int) == [5, 6].toTensor + check res[[3]].toTensor(int) == [7, 8].toTensor + block: + let res = df.relocate(f{"C" <- "c"}, before = "a") + check "C" in res and "c" notin res + check res[[1]].toTensor(int) == [1, 2].toTensor + check res[[2]].toTensor(int) == [3, 4].toTensor + check res[[0]].toTensor(int) == [5, 6].toTensor + check res[[3]].toTensor(int) == [7, 8].toTensor + block: # cannot relocate to itself + try: + let res = df.relocate("a", after = "a") + except KeyError: + discard + + test "Relocate - relocate multiple columns": + let df = toDf({"a" : [1, 2], "b" : [3, 4], "c" : [5, 6], "d" : [7, 8]}) + check df[[0]].toTensor(int) == [1, 2].toTensor + check df[[1]].toTensor(int) == [3, 4].toTensor + check df[[2]].toTensor(int) == [5, 6].toTensor + check df[[3]].toTensor(int) == [7, 8].toTensor + block: # need to hand array for varargs + let res = df.relocate(["b", "c"], after = "d") + check res[[0]].toTensor(int) == [1, 2].toTensor + check res[[2]].toTensor(int) == [3, 4].toTensor + check res[[3]].toTensor(int) == [5, 6].toTensor + check res[[1]].toTensor(int) == [7, 8].toTensor + block: # + let res = df.relocate(["c", "b"], after = "d") + check res[[0]].toTensor(int) == [1, 2].toTensor + check res[[3]].toTensor(int) == [3, 4].toTensor + check res[[2]].toTensor(int) == [5, 6].toTensor + check res[[1]].toTensor(int) == [7, 8].toTensor + + test "Mutate - computing a new column based on two existing": + let df = toDf({ "x" : @[1, 2, 3], "y" : @[10, 11, 12], "z": ["5","6","7"] }) + let dfRes = df.mutate(f{"x+y" ~ `x` + `y`}) + check dfRes.ncols == 4 + check "x+y" in dfRes + check dfRes["x+y", int] == [11,13,15].toTensor + + test "Mutate - computing a new column using a local variable": + let df = toDf({ "x" : @[1, 2, 3], "y" : @[10, 11, 12], "z": ["5","6","7"] }) + # of course local variables can be referenced: + let foo: int = 5 + let dfRes = df.mutate(f{int -> int: "x+foo" ~ `x` + foo}) + check "x+foo" in dfRes + check dfRes["x+foo", int] == [6,7,8].toTensor + + test "Mutate - computing a new column by calling a function": + let df = toDf({ "x" : @[1, 2, 3], "y" : @[10, 11, 12], "z": ["5","6","7"] }) + # they can change type and infer it + let foo = 5 + let dfRes = df.mutate(f{"asInt" ~ parseInt(`z`)}) + check "asInt" in dfRes + check dfRes["asInt", int] == [5,6,7].toTensor + + test "Mutate - computing a new column without an explicit name": + let df = toDf({ "x" : @[1, 2, 3], "y" : @[10, 11, 12], "z": ["5","6","7"] }) + # and if no name is given: + let dfRes = df.mutate(f{`x` + `y`}) + check "(+ x y)" in dfRes + check dfRes["(+ x y)", int] == [11,13,15].toTensor + + test "Mutate - assigning a constant column": + let df = toDf({ "x" : @[1, 2, 3], "y" : @[10, 11, 12], "z": ["5","6","7"] }) + let dfRes = df.mutate( + f{"foo" <- 2}, # generates a constant column with value 2 + f{"bar" <- "x"}, # generates a constant column with value "x", does *not* rename "x" to "bar" + f{"baz" ~ 2} # generates a (non-constant!) column of only values 2 + ) + check dfRes["foo"].kind == colConstant + check dfRes["foo", 0] == %~ 2 + check dfRes["bar"].kind == colConstant + check dfRes["bar", 0] == %~ "x" + check "x" in dfRes # "x" untouched + check dfRes["baz"].kind == colInt # integer column, not constant! + check dfRes["baz", int] == toTensor [2, 2, 2] + +suite "Formulas": + test "Formula containing `if`": + let fn = f{int -> int: if `poopoo` > 5: + `pewpew` + else: + `y`} + check $fn == "(if (elif (> poopoo 5) (pewpew)) (else (y)))" + + let df = toDf({ "poopoo" : @[1,2,7,8], "pewpew" : @[10, 11, 12, 13], + "y" : @[100, 101, 102, 103]}) + check fn.evaluate(df).toTensor(int) == [100, 101, 12, 13].toTensor() + + test "Access using idx()": + let a = [1, 2, 3] + let b = [3, 4, 5] + let c = [4, 5, 6] + let d = [8, 9, 10] + let e = [11, 12, 13] + let df = toDf(a, b, c, d, e) + block: + let dStr = "d" + proc someCall(): string = "e" + let fn1 = f{int -> int: "newCol1" ~ idx("a") + idx(`b`) + idx(c"c") + idx(dStr) + idx(someCall())} + let fn2 = f{int -> int: "newCol2" << max(col("a")) + max(col(`b`)) + max(col(c"c")) + max(col(dStr)) + max(col(someCall()))} + check $fn1 == "newCol1" + check $fn2 == "newCol2" + check fn1.evaluate(df).toTensor(int) == [27, 32, 37].toTensor() + let dfShort = df.summarize(fn2) + check dfShort.len == 1 + check dfShort[$fn2, int][0] == 3 + 5 + 6 + 10 + 13 + block: + proc complProcedure(x: int, s: string, b: seq[int]): int = + result = x + b[0] + ord(s[0]) + + let fn = f{"computeMe" ~ complProcedure(idx("a"), "hello", @[1, 2, 3])} + check fn.evaluate(df).toTensor(int) == @[106, 107, 108].toTensor() + + block: + proc complProcedure(x: int, s: string, b: seq[int], s2: string): int = + result = x + b[0] + ord(s[0]) - ord(s2[0]) + proc anotherCall(): int = 5 + proc moreCalls(x: int): string = $x + + let fn = f{"computeMe" ~ complProcedure(idx("a"), "hello", @[1, 2, 3], anotherCall().moreCalls())} + check fn.evaluate(df).toTensor(int) == @[53, 54, 55].toTensor() + + + test "dplyr / pandas comparison inspired tests": + # some of this functionality was either broken or didn't work before working on + # that dplyr & pandas comparison + let df = toDf({ "A" : concat(newSeqWith(50, "a"), newSeqWith(50, "b")), + "C" : concat(newSeqWith(25, 5), + newSeqWith(25, 15), + newSeqWith(50, 35)), + "B" : toSeq(0 ..< 100) }) + block: + let res = df.group_by("A").summarize(f{int: sum(col("B"))}).filter(f{idx("(sum (col B))") < 2000}) + check res.len == 1 + check res["A", string][0] == "a" + check res["(sum (col B))", int][0] == 1225 + + block: + # now works: + let res = df.group_by("A").filter(f{ sum(col("B")) < 2000}) + check res.len == 50 + check res["B", int] == toSeq(0 ..< 50).toTensor + check res["C", int] == concat(newSeqWith(25, 5), newSeqWith(25, 15)).toTensor + + #block: + # # runtime error: TODO write test! This *could* becoma a CT error in the future. + # expect(FormulaMismatchError): + # discard df.group_by("A").filter(f{ sum(`B`) * 2000}) + + block: + let res = df.group_by(["A", "C"]) + .summarize(f{float: "mean_B" << mean(col("B"))}, + f{int: "sum_B" << sum(col("B"))}, + f{int: "count_B" << col("B").len}) + check res.len == 3 + check res["A", string] == ["a", "a", "b"].toTensor + check res["C", int] == [5, 15, 35].toTensor + check res["mean_B", float] == [12.0, 37.0, 74.5].toTensor + check res["sum_B", int] == [300, 925, 3725].toTensor + check res["count_B", int] == [25, 25, 50].toTensor + + block: + let res = df.group_by(["A", "C"]) + .summarize(f{float: "mean_B" << mean(col("B"))}, + f{float: "sum_B" << sum(col("B"))}, + f{float: "B_first" << col("B")[0]}) + check res.len == 3 + check res["A", string] == ["a", "a", "b"].toTensor + check res["C", int] == [5, 15, 35].toTensor + check res["mean_B", float] == [12.0, 37.0, 74.5].toTensor + check res["sum_B", int] == [300, 925, 3725].toTensor + check res["B_first", int] == [0, 25, 50].toTensor + + block: + let res = df.group_by("A").mutate(f{float: "meanB" << mean(col("B"))}) + check res.len == 100 + check res["meanB", float] == concat(newSeqWith(50, 24.5), newSeqWith(50, 74.5)).toTensor + + #test "Test of idx + mean(col) == mapping operation": + # ## This test is really only to test that the `mutate` formula shown here is + # ## actually compiled correctly into a mapping operation, with or without + # ## user given `~` + # block: + # let df = readCsv("data/mpg.csv") + # .group_by("class") + # .mutate(f{float -> float: "subMeanHwy" ~ `cty` + mean(df["hwy"])}) + # .arrange("class") + # check df.len == 234 + # check df["subMeanHwy", float][0 ..< 5] == [40.8, 39.8, 40.8, 39.8, 39.8].toTensor + # block: + # let df = readCsv("data/mpg.csv") + # .group_by("class") + # .mutate(f{float -> float: `cty` + mean(df["hwy"])}) + # .arrange("class") + # check df.len == 234 + # check df["(+ cty (mean df[\"hwy\"]))", float][0 ..< 5] == [40.8, 39.8, 40.8, 39.8, 39.8].toTensor + + test "Slicing DF with constant column": + var df = toDf({ "Energy" : cycle(linspace(0.0, 24.0, 25), 2), + "Counts" : concat(toSeq(0 ..< 25), + toSeq(0 ..< 25)) }) + df["Type"] = constantColumn("background", df.len) + let dfSlice = df[24 .. 26] + check dfSlice.len == 3 + check dfSlice["Energy", int] == [24, 0, 1].toTensor + check dfSlice["Counts", int] == [24, 0, 1].toTensor + check dfSlice["Type", string] == ["background", "background", "background"].toTensor + + test "Single function call in formula": + proc inRegion(x, y: float, r: string): bool = + result = x > 1 + let df = toDf({"x" : [1,2,3], "y" : [4,5,6]}) + let rad = "foo" + let res = df.filter(f{inRegion(`x`, `y`, rad)}) + check res["x", int] == [2,3].toTensor + + test "Formula referring to bool column": + let df = toDf({"x" : [1,2,3], "y" : [true, false, true]}) + block IsTrue: + let res = df.filter(f{bool: `y`}) + check res["x", int] == [1,3].toTensor + block IsFalse: + let res = df.filter(f{bool: not `y`}) + check res["x", int] == [2].toTensor + +suite "Formulas with object columns using convenience operators": + test "int comparisons": + let df = toDf({"x" : [%~ 1, %~ 2, %~ 3]}) + check df.filter(f{`x` == 1})["x", int] == [1].toTensor + check df.filter(f{`x` != 1})["x", int] == [2,3].toTensor + check df.filter(f{`x` > 1})["x", int] == [2,3].toTensor + check df.filter(f{`x` >= 1})["x", int] == [1,2,3].toTensor + check df.filter(f{`x` < 2})["x", int] == [1].toTensor + + check df.filter(f{1 == `x`})["x", int] == [1].toTensor + check df.filter(f{1 != `x`})["x", int] == [2,3].toTensor + check df.filter(f{1 < `x`})["x", int] == [2,3].toTensor + check df.filter(f{1 <= `x`})["x", int] == [1,2,3].toTensor + check df.filter(f{2 > `x`})["x", int] == [1].toTensor + + test "float comparisons": + let df = toDf({"x" : [%~ 1.0, %~ 2.0, %~ 3.0]}) + check df.filter(f{`x` == 1.0})["x", float] == [1.0].toTensor + check df.filter(f{`x` != 1.0})["x", float] == [2.0,3.0].toTensor + check df.filter(f{`x` > 1.0})["x", float] == [2.0,3.0].toTensor + check df.filter(f{`x` >= 1.0})["x", float] == [1.0,2.0,3.0].toTensor + check df.filter(f{`x` < 2.0})["x", float] == [1.0].toTensor + + check df.filter(f{1.0 == `x`})["x", float] == [1.0].toTensor + check df.filter(f{1.0 != `x`})["x", float] == [2.0,3.0].toTensor + check df.filter(f{1.0 < `x`})["x", float] == [2.0,3.0].toTensor + check df.filter(f{1.0 <= `x`})["x", float] == [1.0,2.0,3.0].toTensor + check df.filter(f{2.0 > `x`})["x", float] == [1.0].toTensor + + test "float comparisons with int Value": + let df = toDf({"x" : [%~ 1, %~ 2, %~ 3]}) + check df.filter(f{`x` == 1.0})["x", int] == [1].toTensor + check df.filter(f{`x` != 1.0})["x", int] == [2,3].toTensor + check df.filter(f{`x` > 1.0})["x", int] == [2,3].toTensor + check df.filter(f{`x` >= 1.0})["x", int] == [1,2,3].toTensor + check df.filter(f{`x` < 2.0})["x", int] == [1].toTensor + + check df.filter(f{1.0 == `x`})["x", int] == [1].toTensor + check df.filter(f{1.0 != `x`})["x", int] == [2,3].toTensor + check df.filter(f{1.0 < `x`})["x", int] == [2,3].toTensor + check df.filter(f{1.0 <= `x`})["x", int] == [1,2,3].toTensor + check df.filter(f{2.0 > `x`})["x", int] == [1].toTensor + + test "bool comparisons": + let df = toDf({"x" : [true, false, true]}) + check df.filter(f{`x` == true})["x", bool] == [true, true].toTensor + check df.filter(f{`x` == false})["x", bool] == [false].toTensor + + check df.filter(f{`x` != true})["x", bool] == [false].toTensor + check df.filter(f{`x` != false})["x", bool] == [true, true].toTensor + + test "string comparisons": + let df = toDf({"x" : ["foo", "bar", "baz"]}) + check df.filter(f{`x` == "foo"})["x", string] == ["foo"].toTensor + check df.filter(f{`x` != "foo"})["x", string] == ["bar", "baz"].toTensor + check df.filter(f{`x` in ["foo", "bar"]})["x", string] == ["foo", "bar"].toTensor + echo df.filter(f{`x` notin ["foo", "bar"]})["x", string] + check df.filter(f{`x` notin ["foo", "bar"]})["x", string] == ["baz"].toTensor + +suite "Formulas with nodes lifted out of body": + ## Ref'd by: `FWLN` in code + ## How do we test this at CT? -> define a procedure that modifies a global counter + ## check only called once + test "Lifting out column operation": + let x = @[1, 2, 3] + let y = @[4, 5, 6] + + # compute new column `z` that uses sum + var counter = 0 + proc mySum(c: Tensor[int]): int = + inc counter + result = c.sum() + + block A: + var df = toDf({"x" : x, "y" : y}) + df = df.mutate(f{int -> int: "z" ~ `x` + mySum(col("y"))}) + check "z" in df + check df["z", int] == @[16, 17, 18].toTensor() + check counter == 1 # and *not* 3 as it was before + + proc mySum2(c: Tensor[int], el: int): int = + inc counter + result = c.sum() + el + + block B: + ## XXX: Add Note that constructs like this one *cannot* be lifted for + ## obvious reasons! + counter = 0 + var df = toDf({"x" : x, "y" : y}) + df = df.mutate(f{int -> int: "z" ~ `x` + mySum2(col("y"), `x`)}) + check "z" in df + check df["z", int] == @[17, 19, 21].toTensor() + check counter == 3 # no lift! + + block C: + ## This can + counter = 0 + var df = toDf({"x" : x, "y" : y}) + df = df.mutate(f{int -> int: "z" ~ `x` + col("y").mySum()}) + check "z" in df + check df["z", int] == @[16, 17, 18].toTensor() + check counter == 1 # and *not* 3 as it was before + + block D: + ## But this cannot, same reason as block B + counter = 0 + var df = toDf({"x" : x, "y" : y}) + df = df.mutate(f{int -> int: "z" ~ `x` + col("y").mySum2(idx("x"))}) + check "z" in df + check df["z", int] == @[17, 19, 21].toTensor() + check counter == 3 + + block E: + ## Do *not* lift a `nnkDotExpr` as first chiled of call containing + ## an `idx(bar)` argument! + type + Foo = object + counter = 0 + proc attenuationCoefficient(x: Foo, el: int): int = + inc counter + result = 1 + let el = Foo() + var df = toDf(x, y) + .mutate(f{int: "μ" ~ el.attenuationCoefficient(idx("x"))}) + check "μ" in df + check counter == 3 + check df["μ", int] == @[1, 1, 1].toTensor() + block F: + ## *Do* lift a `nnkDotExpr` as first chiled of call containing + ## a `col(bar)` argument! + type + Foo = object + counter = 0 + proc attenuationCoefficient(x: Foo, el: Tensor[int]): int = + inc counter + result = sum(el) + let el = Foo() + var df = toDf(x, y) + .mutate(f{int: "μ" ~ el.attenuationCoefficient(col("x"))}) + check "μ" in df + check counter == 1 + check df["μ", int] == @[6, 6, 6].toTensor()