Skip to content

Commit

Permalink
cygpath not required to be in PATH unless non-standard cygpath instal…
Browse files Browse the repository at this point in the history
…l location
  • Loading branch information
philwalk committed Dec 26, 2024
2 parents 321870e + 69e6c8c commit b93ed49
Show file tree
Hide file tree
Showing 5 changed files with 217 additions and 31 deletions.
15 changes: 8 additions & 7 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ lazy val supportedScalaVersions = List(scala3)
javacOptions ++= Seq("-source", "11", "-target", "11")

//enablePlugins(ScalaNativePlugin)
//nativeLinkStubs := true

//ThisBuild / envFileName := "dev.env" // sbt-dotenv plugin gets build environment here
ThisBuild / scalaVersion := scalaVer
Expand Down Expand Up @@ -53,15 +54,15 @@ ThisBuild / publishTo := {

ThisBuild / publishMavenStyle.withRank(KeyRanks.Invisible) := true

ThisBuild / crossScalaVersions := supportedScalaVersions

// For all Sonatype accounts created on or after February 2021
ThisBuild / sonatypeCredentialHost := "s01.oss.sonatype.org"

resolvers += Resolver.mavenLocal

publishTo := sonatypePublishToBundle.value

ThisBuild / crossScalaVersions := supportedScalaVersions

Compile / packageBin / packageOptions +=
Package.ManifestAttributes(java.util.jar.Attributes.Name.CLASS_PATH -> "")

Expand All @@ -77,11 +78,11 @@ lazy val root = (project in file(".")).
)

libraryDependencies ++= Seq(
"org.scalatest" %% "scalatest" % "3.2.19" % Test,
//"com.github.sbt" % "junit-interface" % "0.13.3" % Test,
"org.simpleflatmapper" % "sfm-csv" % "9.0.2",
"io.github.chronoscala" %% "chronoscala" % "2.0.10",
"org.vastblue" % "unifile_3" % "0.3.6",
"org.scalatest" %% "scalatest" % "3.2.19" % Test,
"org.vastblue" % "unifile_3" % "0.3.6",
"org.simpleflatmapper" % "sfm-csv" % "9.0.2",
"com.github.tototoshi" %% "scala-csv" % "2.0.0",
"io.github.chronoscala" %% "chronoscala" % "2.0.10",
)

/*
Expand Down
2 changes: 1 addition & 1 deletion project/build.sbt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
ThisBuild / scalaVersion := "2.12.19"
ThisBuild / scalaVersion := "2.12.20"
2 changes: 1 addition & 1 deletion project/plugins.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ scalaVersion := "2.12.20"

val SONATYPE_VERSION = sys.env.getOrElse("SONATYPE_VERSION", "3.12.2") // "3.10.0")

//addSbtPlugin("org.scala-native" % "sbt-scala-native" % "0.4.17")
//addSbtPlugin("org.scala-native" % "sbt-scala-native" % "0.5.3") // "0.4.17")

addSbtPlugin("ch.epfl.scala" % "sbt-bloop" % "2.0.6")
addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.13.0")
Expand Down
53 changes: 31 additions & 22 deletions src/main/scala/vastblue/file/FastCsv.scala
Original file line number Diff line number Diff line change
Expand Up @@ -8,36 +8,18 @@ import org.simpleflatmapper.csv.*
import java.io.{FileNotFoundException, Reader, StringReader, File as JFile}
import java.nio.file.{Path, Files as JFiles, Paths as JPaths}
import scala.jdk.CollectionConverters.*
import scala.collection.immutable.ArraySeq

/**
* Csv Parser based on simpleflatmapper.
* (replaces SimpleCsv)
*/
object FastCsv {

// TODO: verify that this does not process more than the first line of the input String
def parseLine(str: String): List[String] = parseCsvLine(str) // alias
def parseCsvLine(str: String): List[String] = {
parseCsvStream(str).toList match {
case cols :: tail =>
cols.toList
case Nil =>
Nil
}
}
def parseCsvStream(str: String): Iterator[List[String]] = {
apply(str).iterator.map { _.toList }
}
def parseFile(infile: Path): FastCsv = {
FastCsv(infile, ",")
}
// def parseCsvFile(infile: Path): FastCsv = { // alias
// parseFile(infile)
// }

def apply(jfile: JFile, _delimiter: String): FastCsv = {
apply(jfile.toPath, _delimiter)
}

def apply(p: Path, delimiter: String = ""): FastCsv = {
if (!p.isFile) {
throw new java.nio.file.NoSuchFileException(s"${p.posx}")
Expand All @@ -49,10 +31,35 @@ object FastCsv {
val reader = new StringReader(str)
new FastCsv(reader, p.toString, aDelimiter)
}

/**
 * Builds a comma-delimited parser over in-memory CSV text.
 * The identifier is the first 10 characters of `content` followed by "...".
 */
def apply(content: String): FastCsv = {
  val source = new StringReader(content)
  val label  = s"${content.take(10)}..."
  new FastCsv(source, label, ",")
}

/** Alias for `parseCsvLine`: returns the columns of the first CSV row in `str`. */
// TODO: verify that this does not process more than the first line of the input String
def parseLine(str: String): ArraySeq[String] = parseCsvLine(str) // alias

/**
 * Returns the columns of the first row produced by `parseCsvStream(str)`,
 * or an empty `ArraySeq` when the stream yields no rows.
 */
def parseCsvLine(str: String): ArraySeq[String] =
  parseCsvStream(str).nextOption().getOrElse(ArraySeq.empty[String])

/** Parses `str` as comma-delimited CSV and returns an iterator over its rows. */
def parseCsvStream(str: String): Iterator[ArraySeq[String]] =
  apply(str).iterator

/** Opens `infile` as a comma-delimited CSV (no delimiter auto-detection). */
def parseFile(infile: Path): FastCsv =
  apply(infile, ",")
// def parseCsvFile(infile: Path): FastCsv = { // alias
// parseFile(infile)
// }

/* will not quit on error unless override ignoreErrors = false */
def autoDetectDelimiter(sampleText: String, fname: String, ignoreErrors: Boolean = true): String = {
var (tabs, commas, semis, pipes) = (0, 0, 0, 0)
Expand Down Expand Up @@ -103,12 +110,14 @@ case class FastCsv(val reader: Reader, identifier: String, delimiter: String) {
case ";" => ';'
case _ => delimiter.charAt(0)
}
def iterator: Iterator[Seq[String]] = CsvParser.separator(delim).iterator(reader).asScala.map { _.toSeq }

def rawrows: Seq[Seq[String]] = iterator.toSeq.filter { (cols: Seq[String]) => cols != Seq("") } // discard gratuitous empty rows
def rawrows: Seq[Seq[String]] = iterator.toSeq.filter { (cols: ArraySeq[String]) => cols != Seq("") } // discard gratuitous empty rows
def rows = rawrows.map { row => row.map(_.trim) }
def rowstrimmed = rows

// def stream = CsvParser.separator(delim).iterator(reader).asScala.iterator
override def toString = identifier

import org.simpleflatmapper.csv.*
def iterator: Iterator[ArraySeq[String]] = CsvParser.separator(delim).iterator(reader).asScala.map { ArraySeq.unsafeWrapArray(_) }
}
176 changes: 176 additions & 0 deletions src/main/scala/vastblue/file/FastCsvTototoshi.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
//#!/usr/bin/env -S scala -explain
package vastblue.file

import vastblue.pallet.*
import vastblue.file.Util.*
import com.github.tototoshi.csv.*

import java.io.{FileNotFoundException, Reader, StringReader, File as JFile}
import java.nio.file.{Path, Files as JFiles, Paths as JPaths}
import scala.jdk.CollectionConverters.*

/**
* Csv Parser based on tototoshi scala-csv.
* (replaces SimpleCsv)
*/
object FastCsvToto {

  /** Opens `jfile` with the given delimiter; delegates to the `Path` overload. */
  def apply(jfile: JFile, _delimiter: String): FastCsvToto = {
    apply(jfile.toPath, _delimiter)
  }

  /**
   * Opens the file at `p` as CSV.
   *
   * @param p         path of the file to parse
   * @param delimiter column delimiter; when empty, it is guessed from the first
   *                  100 lines of the file via `autoDetectDelimiter`
   *                  (with `ignoreErrors = false`)
   * @throws java.nio.file.NoSuchFileException if `p` is not a file
   */
  def apply(p: Path, delimiter: String = ""): FastCsvToto = {
    if (!p.isFile) {
      throw new java.nio.file.NoSuchFileException(s"${p.posx}")
    }
    // Sample the file only when the caller did not supply a delimiter,
    // so an explicit delimiter does not cost an extra pass over the file.
    def autoDelimiter: String =
      autoDetectDelimiter(readLines(p).take(100).mkString("\n"), p.toString, ignoreErrors = false)

    val aDelimiter = if (delimiter.nonEmpty) delimiter else autoDelimiter
    val reader     = new StringReader(p.contentAsString)
    new FastCsvToto(reader, p.toString, aDelimiter)
  }

  /**
   * Builds a comma-delimited parser over in-memory CSV text.
   * The identifier is the first 10 characters of `content` followed by "...".
   */
  def apply(content: String): FastCsvToto = {
    new FastCsvToto(new StringReader(content), s"${content.take(10)}...", ",")
  }

  /** Alias for `parseCsvLine`. */
  def parseLine(str: String): List[String] = parseCsvLine(str) // alias

  /**
   * Returns the columns of the first CSV record in `str`, or `Nil` if there is none.
   * Only the first record is pulled from the row iterator; later rows are not
   * materialised.
   */
  def parseCsvLine(str: String): List[String] = {
    parseCsvStream(str).nextOption().getOrElse(Nil)
  }

  /** Parses `str` as comma-delimited CSV and returns an iterator over its rows. */
  def parseCsvStream(str: String): Iterator[List[String]] = {
    apply(str).iterator.map { _.toList }
  }

  /** Opens `infile` as a comma-delimited CSV (no delimiter auto-detection). */
  def parseFile(infile: Path): FastCsvToto = {
    FastCsvToto(infile, ",")
  }
  // def parseCsvFile(infile: Path): FastCsvToto = { // alias
  //   parseFile(infile)
  // }

  /**
   * Guesses the column delimiter by counting candidate characters in `sampleText`.
   * Will not quit on error unless `ignoreErrors = false` is passed.
   *
   * @param sampleText   text sample to inspect
   * @param fname        file name, used only in the error message
   * @param ignoreErrors when true, an undecidable sample yields "" instead of an error
   * @return ",", "\t", "|" or ";", or "" when undecidable and `ignoreErrors` is true
   */
  def autoDetectDelimiter(sampleText: String, fname: String, ignoreErrors: Boolean = true): String = {
    val tabs   = sampleText.count(_ == '\t')
    val commas = sampleText.count(_ == ',')
    val semis  = sampleText.count(_ == ';')
    val pipes  = sampleText.count(_ == '|')

    // Premise:
    //   tab-delimited files contain more tabs than commas,
    //   comma-delimited files contain more commas than tabs.
    // Provides a reasonably fast guess, but can potentially fail.
    //
    // A much slower but more thorough approach would be:
    //   1. remove quoted strings
    //   2. split("[\r\n]+")  // extract multiple lines
    //   3. count columns-per-row tallies using various delimiters
    //   4. the tally with the most consistency is the "winner"
    //
    // In case of a tie between commas and tabs, commas win (TODO: configurable).
    // Pipes and semicolons must be strictly the most frequent to be chosen.
    if (commas >= tabs && commas >= pipes && commas >= semis) {
      ","
    } else if (tabs >= commas && tabs >= pipes && tabs >= semis) {
      "\t"
    } else if (pipes > commas && pipes > tabs && pipes > semis) {
      "|"
    } else if (semis > commas && semis > tabs && semis > pipes) {
      ";"
    } else if (ignoreErrors) {
      ""
    } else {
      sys.error(
        s"unable to choose delimiter: tabs[$tabs], commas[$commas], semis[$semis], pipes[$pipes] for file:\n[${fname}]"
      )
    }
  }
}

/**
 * CSV reader over `reader`, parsing records with the tototoshi `CSVParser`.
 *
 * @param reader     character source; it is wrapped in a single shared `BufferedReader`,
 *                   so rows already consumed through `iterator` are not produced again
 * @param identifier label returned by `toString` (e.g. a file name)
 * @param delimiter  column delimiter; only its first character is used
 */
case class FastCsvToto(val reader: Reader, identifier: String, delimiter: String) {
  if (delimiter.length != 1) {
    System.err.printf("warning: only sees the first character of the delimiter [%s]\n", delimiter)
  }

  /**
   * The delimiter character handed to the parser: the first character of `delimiter`,
   * or a space when `delimiter` is empty.
   * NOTE(review): the original comment says an empty delimiter treats each row as a
   * single column, but a space delimiter presumably still splits on spaces -- confirm.
   */
  def delim: Char = if (delimiter.isEmpty) ' ' else delimiter.charAt(0)

  /** All remaining rows, minus rows consisting of a single empty column. */
  def rawrows: Seq[Seq[String]] = iterator.toSeq.filter { (cols: Seq[String]) => cols != Seq("") } // discard gratuitous empty rows

  /** `rawrows` with every column trimmed. */
  def rows = rawrows.map { row => row.map(_.trim) }
  def rowstrimmed = rows

  override def toString = identifier

  import java.io.BufferedReader
  import scala.annotation.tailrec
  val br: BufferedReader = new BufferedReader(reader)

  /** Iterates the remaining physical lines (terminators included) until end of input. */
  inline def iterateLines: Iterator[String] = Iterator.continually(readLine).takeWhile { _ != null }

  /** Parser settings: `delim` as separator, double quote as both quote and escape character. */
  class csvFormat extends CSVFormat {
    val delimiter: Char              = delim
    val quoteChar: Char              = '"'
    val escapeChar: Char             = '"'
    val lineTerminator: String       = "\n" // only used by tototoshi CSVWriter
    val quoting: Quoting             = QUOTE_MINIMAL
    val treatEmptyLineAsNil: Boolean = false
  }

  lazy val csvParser = new CSVParser(new csvFormat)

  /**
   * Iterates the remaining CSV records.
   * A record whose quoted field spans several physical lines is returned as one row.
   */
  inline def iterator: Iterator[Seq[String]] =
    Iterator.continually(nextRecord()).takeWhile(_.isDefined).collect { case Some(cols) => cols }

  /**
   * Reads the next complete record, or `None` at end of input.
   *
   * `csvParser.parseLine` returns `None` when the text so far is not a complete record
   * (a quoted field is still open). In that case the following physical line is appended
   * and parsing is retried, rather than discarding the partial record.
   */
  private def nextRecord(): Option[Seq[String]] = {
    @tailrec
    def loop(pending: String): Option[Seq[String]] = {
      Option(readLine) match {
        case None =>
          if (pending.nonEmpty) {
            // Input ended inside a quoted field: the partial record is dropped,
            // as before, but no longer silently.
            System.err.printf("warning: discarding unterminated quoted record at end of input [%s]\n", identifier)
          }
          None
        case Some(line) =>
          val candidate = pending + line
          csvParser.parseLine(candidate) match {
            case None => loop(candidate)
            case cols => cols
          }
      }
    }
    loop("")
  }

  /**
   * Reads the next physical line from `br`, including its terminator.
   * A line ends at "\n", "\r\n", U+2028, U+2029 or U+0085; a lone '\r' does not end it.
   *
   * @return the line text with its terminator, or `null` at end of input
   */
  inline def readLine: String = {
    val sb     = new StringBuilder()
    var c: Int = 0
    def cc: Char         = c.toChar
    def nonEOL: Boolean = c != -1 && c != '\n' && c != '\u2028' && c != '\u2029' && c != '\u0085'

    while (nonEOL) {
      c = br.read()
      if (c != -1) {
        sb.append(cc)
        if (nonEOL) {
          if (c == '\r') {
            // Peek one character: "\r\n" is one terminator, otherwise push it back.
            br.mark(1)
            c = br.read()
            if (c != -1) {
              if (c == '\n') {
                sb.append('\n')
              } else {
                br.reset()
              }
            }
          }
        }
      }
    }
    if (sb.isEmpty) {
      null
    } else {
      sb.toString()
    }
  }
}

0 comments on commit b93ed49

Please sign in to comment.