diff --git a/modules/common/src/main/scala/docspell/common/SystemCommand.scala b/modules/common/src/main/scala/docspell/common/SystemCommand.scala deleted file mode 100644 index 702c546db4..0000000000 --- a/modules/common/src/main/scala/docspell/common/SystemCommand.scala +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright 2020 Eike K. & Contributors - * - * SPDX-License-Identifier: AGPL-3.0-or-later - */ - -package docspell.common - -import java.io.InputStream -import java.lang.ProcessBuilder.Redirect -import java.util.concurrent.TimeUnit - -import scala.jdk.CollectionConverters._ - -import cats.effect._ -import cats.implicits._ -import fs2.io.file.Path -import fs2.{Stream, io, text} - -import docspell.common.{exec => newExec} -import docspell.logging.Logger - -// better use `SysCmd` and `SysExec` -object SystemCommand { - - final case class Config( - program: String, - args: Seq[String], - timeout: Duration, - env: Map[String, String] = Map.empty - ) { - - def toSysCmd = newExec - .SysCmd(program, newExec.Args(args)) - .withTimeout(timeout) - .addEnv(newExec.Env(env)) - - def mapArgs(f: String => String): Config = - Config(program, args.map(f), timeout) - - def replace(repl: Map[String, String]): Config = - mapArgs(s => - repl.foldLeft(s) { case (res, (k, v)) => - res.replace(k, v) - } - ) - - def withEnv(key: String, value: String): Config = - copy(env = env.updated(key, value)) - - def addEnv(moreEnv: Map[String, String]): Config = - copy(env = env ++ moreEnv) - - def appendArgs(extraArgs: Args): Config = - copy(args = args ++ extraArgs.args) - - def appendArgs(extraArgs: Seq[String]): Config = - copy(args = args ++ extraArgs) - - def toCmd: List[String] = - program :: args.toList - - lazy val cmdString: String = - toCmd.mkString(" ") - } - - final case class Args(args: Vector[String]) extends Iterable[String] { - override def iterator = args.iterator - - def prepend(a: String): Args = Args(a +: args) - - def prependWhen(flag: Boolean)(a: String): Args = - prependOption(Option.when(flag)(a)) - - def prependOption(value: Option[String]): Args = - value.map(prepend).getOrElse(this) - - def append(a: String, as: String*): Args = - Args(args ++ (a +: as.toVector)) - - def appendOption(value: Option[String]): Args = - value.map(append(_)).getOrElse(this) - - def appendOptionVal(first: String, second: Option[String]): Args = - second.map(b => append(first, b)).getOrElse(this) - - def appendWhen(flag: Boolean)(a: String, as: String*): Args = - if (flag) append(a, as: _*) else this - - def appendWhenNot(flag: Boolean)(a: String, as: String*): Args = - if (!flag) append(a, as: _*) else this - - def append(p: Path): Args = - append(p.toString) - - def append(as: Iterable[String]): Args = - Args(args ++ as.toVector) - } - object Args { - val empty: Args = Args() - - def apply(as: String*): Args = - Args(as.toVector) - } - - final case class Result(rc: Int, stdout: String, stderr: String) - - def exec[F[_]: Sync]( - cmd: Config, - logger: Logger[F], - wd: Option[Path] = None, - stdin: Stream[F, Byte] = Stream.empty - ): Stream[F, Result] = - startProcess(cmd, wd, logger, stdin) { proc => - Stream.eval { - for { - _ <- writeToProcess(stdin, proc) - term <- Sync[F].blocking(proc.waitFor(cmd.timeout.seconds, TimeUnit.SECONDS)) - _ <- - if (term) - logger.debug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}") - else - logger.warn( - s"Command `${cmd.cmdString}` did not finish in ${cmd.timeout.formatExact}!" - ) - _ <- if (!term) timeoutError(proc, cmd) else Sync[F].pure(()) - out <- - if (term) inputStreamToString(proc.getInputStream) - else Sync[F].pure("") - err <- - if (term) inputStreamToString(proc.getErrorStream) - else Sync[F].pure("") - } yield Result(proc.exitValue, out, err) - } - } - - def execSuccess[F[_]: Sync]( - cmd: Config, - logger: Logger[F], - wd: Option[Path] = None, - stdin: Stream[F, Byte] = Stream.empty - ): Stream[F, Result] = - exec(cmd, logger, wd, stdin).flatMap { r => - if (r.rc != 0) - Stream.raiseError[F]( - new Exception( - s"Command `${cmd.cmdString}` returned non-zero exit code ${r.rc}. Stderr: ${r.stderr}" - ) - ) - else Stream.emit(r) - } - - private def startProcess[F[_]: Sync, A]( - cmd: Config, - wd: Option[Path], - logger: Logger[F], - stdin: Stream[F, Byte] - )( - f: Process => Stream[F, A] - ): Stream[F, A] = { - val log = logger.debug(s"Running external command: ${cmd.cmdString}") - val hasStdin = stdin.take(1).compile.last.map(_.isDefined) - val proc = log *> hasStdin.flatMap(flag => - Sync[F].blocking { - val pb = new ProcessBuilder(cmd.toCmd.asJava) - .redirectInput(if (flag) Redirect.PIPE else Redirect.INHERIT) - .redirectError(Redirect.PIPE) - .redirectOutput(Redirect.PIPE) - - val pbEnv = pb.environment() - cmd.env.foreach { case (key, value) => - pbEnv.put(key, value) - } - wd.map(_.toNioPath.toFile).foreach(pb.directory) - pb.start() - } - ) - Stream - .bracket(proc)(p => - logger.debug(s"Closing process: `${cmd.cmdString}`").map(_ => p.destroy()) - ) - .flatMap(f) - } - - private def inputStreamToString[F[_]: Sync](in: InputStream): F[String] = - io.readInputStream(Sync[F].pure(in), 16 * 1024, closeAfterUse = false) - .through(text.utf8.decode) - .chunks - .map(_.toVector.mkString) - .fold1(_ + _) - .compile - .last - .map(_.getOrElse("")) - - private def writeToProcess[F[_]: Sync]( - data: Stream[F, Byte], - proc: Process - ): F[Unit] = - data - .through(io.writeOutputStream(Sync[F].blocking(proc.getOutputStream))) - .compile - .drain - - private def timeoutError[F[_]: Sync](proc: Process, cmd: Config): F[Unit] = - Sync[F].blocking(proc.destroyForcibly()).attempt *> { - Sync[F].raiseError( - new Exception( - s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})" - ) - ) - } -} diff --git a/modules/common/src/main/scala/docspell/common/exec/Env.scala b/modules/common/src/main/scala/docspell/common/exec/Env.scala index 2524d35ac8..c4bdfafbd9 100644 --- a/modules/common/src/main/scala/docspell/common/exec/Env.scala +++ b/modules/common/src/main/scala/docspell/common/exec/Env.scala @@ -17,6 +17,9 @@ case class Env(values: Map[String, String]) { def addAll(e: Env): Env = Env(values ++ e.values) + def modifyValue(f: String => String): Env = + Env(values.view.mapValues(f).toMap) + def ++(e: Env) = addAll(e) def foreach(f: (String, String) => Unit): Unit = diff --git a/modules/common/src/main/scala/docspell/common/exec/ExternalCommand.scala b/modules/common/src/main/scala/docspell/common/exec/ExternalCommand.scala new file mode 100644 index 0000000000..47b003b0e8 --- /dev/null +++ b/modules/common/src/main/scala/docspell/common/exec/ExternalCommand.scala @@ -0,0 +1,89 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +package docspell.common.exec + +import docspell.common.Duration +import docspell.common.Ident +import docspell.common.exec.Env +import docspell.common.exec.ExternalCommand.ArgMapping +import docspell.common.exec.SysCmd + +final case class ExternalCommand( + program: String, + args: Seq[String], + timeout: Duration, + env: Map[String, String] = Map.empty, + argMappings: Map[Ident, ArgMapping] = Map.empty +) { + def withVars(vars: Map[String, String]): ExternalCommand.WithVars = + ExternalCommand.WithVars(this, vars) + + import ExternalCommand.pattern + + def resolve(vars: Map[String, String]): SysCmd = { + val replace = ExternalCommand.replaceString(vars) _ + val resolvedArgMappings = + argMappings.view.mapValues(_.resolve(replace).firstMatch).toMap + val resolvedArgs = args.map(replace).flatMap { arg => + resolvedArgMappings + .find(e => pattern(e._1.id) == arg) + .map(_._2) + .getOrElse(List(arg)) + } + + SysCmd(replace(program), resolvedArgs: _*) + .withTimeout(timeout) + .withEnv(_ => Env(env).modifyValue(replace)) + } +} + +object ExternalCommand { + private val openPattern = "{{" + private val closePattern = "}}" + + private def pattern(s: String): String = s"${openPattern}${s}${closePattern}" + + def apply(program: String, args: Seq[String], timeout: Duration): ExternalCommand = + ExternalCommand(program, args, timeout, Map.empty, Map.empty) + + final case class ArgMapping( + value: String, + mappings: List[ArgMatch] + ) { + private[exec] def resolve(replace: String => String): ArgMapping = + ArgMapping(replace(value), mappings.map(_.resolve(replace))) + + def firstMatch: List[String] = + mappings.find(am => value.matches(am.matches)).map(_.args).getOrElse(Nil) + } + + final case class ArgMatch( + matches: String, + args: List[String] + ) { + private[exec] def resolve(replace: String => String): ArgMatch = + ArgMatch(replace(matches), args.map(replace)) + } + + private def replaceString(vars: Map[String, String])(in: String): String = + vars.foldLeft(in) { case (result, (name, value)) => + val key = s"{{$name}}" + result.replace(key, value) + } + + final case class WithVars(cmd: ExternalCommand, vars: Map[String, String]) { + def resolved: SysCmd = cmd.resolve(vars) + def append(more: (String, String)*): WithVars = + WithVars(cmd, vars ++ more.toMap) + + def withVar(key: String, value: String): WithVars = + WithVars(cmd, vars.updated(key, value)) + + def withVarOption(key: String, value: Option[String]): WithVars = + value.map(withVar(key, _)).getOrElse(this) + } +} diff --git a/modules/common/src/main/scala/docspell/common/exec/SysExec.scala b/modules/common/src/main/scala/docspell/common/exec/SysExec.scala index da7b10c223..a53a4c6ff9 100644 --- a/modules/common/src/main/scala/docspell/common/exec/SysExec.scala +++ b/modules/common/src/main/scala/docspell/common/exec/SysExec.scala @@ -38,6 +38,20 @@ trait SysExec[F[_]] { def waitFor(timeout: Option[Duration] = None): F[Int] + /** Uses `waitFor` and throws when return code is non-zero. Logs stderr and stdout while + * waiting. + */ + def runToSuccess(logger: Logger[F], timeout: Option[Duration] = None)(implicit + F: Async[F] + ): F[Int] + + /** Uses `waitFor` and throws when return code is non-zero. Logs stderr while waiting + * and collects stdout once finished successfully. + */ + def runToSuccessStdout(logger: Logger[F], timeout: Option[Duration] = None)(implicit + F: Async[F] + ): F[String] + /** Sends a signal to the process to terminate it immediately */ def cancel: F[Unit] @@ -75,6 +89,12 @@ object SysExec { proc <- startProcess(logger, cmd, workdir, stdin) fibers <- Resource.eval(Ref.of[F, List[F[Unit]]](Nil)) } yield new SysExec[F] { + private lazy val basicName: String = + cmd.program.lastIndexOf(java.io.File.separatorChar.toInt) match { + case n if n > 0 => cmd.program.drop(n + 1) + case _ => cmd.program.takeRight(16) + } + def stdout: Stream[F, Byte] = fs2.io.readInputStream( Sync[F].blocking(proc.getInputStream), @@ -107,6 +127,39 @@ object SysExec { ) } + def runToSuccess(logger: Logger[F], timeout: Option[Duration])(implicit + F: Async[F] + ): F[Int] = + logOutputs(logger, basicName).use(_.waitFor(timeout).flatMap { + case rc if rc == 0 => Sync[F].pure(0) + case rc => + Sync[F].raiseError( + new Exception(s"Command `${cmd.program}` returned non-zero exit code ${rc}") + ) + }) + + def runToSuccessStdout(logger: Logger[F], timeout: Option[Duration])(implicit + F: Async[F] + ): F[String] = + F.background( + stderrLines + .through(line => Stream.eval(logger.debug(s"[$basicName (err)]: $line"))) + .compile + .drain + ).use { f1 => + waitFor(timeout) + .flatMap { + case rc if rc == 0 => stdout.through(fs2.text.utf8.decode).compile.string + case rc => + Sync[F].raiseError[String]( + new Exception( + s"Command `${cmd.program}` returned non-zero exit code ${rc}" + ) + ) + } + .flatTap(_ => f1) + } + def consumeOutputs(out: Pipe[F, String, Unit], err: Pipe[F, String, Unit])(implicit F: Async[F] ): Resource[F, SysExec[F]] = diff --git a/modules/common/src/test/scala/docspell/common/exec/ExternalCommandTest.scala b/modules/common/src/test/scala/docspell/common/exec/ExternalCommandTest.scala new file mode 100644 index 0000000000..120e68cfdb --- /dev/null +++ b/modules/common/src/test/scala/docspell/common/exec/ExternalCommandTest.scala @@ -0,0 +1,74 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +package docspell.common.exec + +import docspell.common.Duration +import docspell.common.Ident +import docspell.common.exec.Args +import docspell.common.exec.Env +import docspell.common.exec.ExternalCommand._ +import docspell.common.exec.SysCmd + +import munit.FunSuite + +class ExternalCommandTest extends FunSuite { + + test("resolve") { + val cmd = ExternalCommand( + program = "tesseract", + args = "{{infile}}" :: "{{lang-spec}}" :: "out" :: "pdf" :: "txt" :: Nil, + timeout = Duration.minutes(5), + env = Map.empty, + argMappings = Map( + Ident.unsafe("lang-spec") -> ArgMapping( + value = "{{lang}}", + mappings = List( + ArgMatch( + matches = "jpn_vert", + args = List("-l", "jpn_vert", "-c", "preserve_interword_spaces=1") + ), + ArgMatch( + matches = ".*", + args = List("-l", "{{lang}}") + ) + ) + ) + ) + ) + + val varsDe = Map("lang" -> "de", "encoding" -> "UTF_8", "infile" -> "text.jpg") + assertEquals( + cmd.resolve(varsDe), + SysCmd( + "tesseract", + Args.of("text.jpg", "-l", "de", "out", "pdf", "txt"), + Env.empty, + Duration.minutes(5) + ) + ) + + val varsJpnVert = varsDe.updated("lang", "jpn_vert") + assertEquals( + cmd.resolve(varsJpnVert), + SysCmd( + "tesseract", + Args.of( + "text.jpg", + "-l", + "jpn_vert", + "-c", + "preserve_interword_spaces=1", + "out", + "pdf", + "txt" + ), + Env.empty, + Duration.minutes(5) + ) + ) + } +} diff --git a/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala b/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala index 5f1253f570..0b7981cd0a 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala @@ -11,7 +11,8 @@ import cats.implicits._ import fs2.io.file.{Files, Path} import fs2.{Pipe, Stream} -import docspell.common._ +import docspell.common.exec.ExternalCommand +import docspell.common.exec.SysExec import docspell.common.util.File import docspell.convert.ConversionResult import docspell.convert.ConversionResult.{Handler, successPdf, successPdfTxt} @@ -21,11 +22,11 @@ private[extern] object ExternConv { def toPDF[F[_]: Async: Files, A]( name: String, - cmdCfg: SystemCommand.Config, + cmdCfg: ExternalCommand.WithVars, wd: Path, useStdin: Boolean, logger: Logger[F], - reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] + reader: (Path, Int) => F[ConversionResult[F]] )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = Stream .resource(File.withTempDir[F](wd, s"docspell-$name")) @@ -33,32 +34,21 @@ private[extern] object ExternConv { val inFile = dir.resolve("infile").absolute.normalize val out = dir.resolve("out.pdf").absolute.normalize val sysCfg = - cmdCfg.replace( - Map( - "{{outfile}}" -> out.toString - ) ++ - (if (!useStdin) Map("{{infile}}" -> inFile.toString) - else Map.empty) - ) + cmdCfg + .withVar("outfile", out.toString) + .withVarOption("infile", Option.when(!useStdin)(inFile.toString)) + .resolved val createInput: Pipe[F, Byte, Unit] = if (useStdin) _ => Stream.emit(()) else storeDataToFile(name, logger, inFile) - in.through(createInput).flatMap { _ => - SystemCommand - .exec[F]( - sysCfg, - logger, - Some(dir), - if (useStdin) in - else Stream.empty - ) - .evalMap(result => - logResult(name, result, logger) - .flatMap(_ => reader(out, result)) - .flatMap(handler.run) - ) + in.through(createInput).evalMap { _ => + SysExec(sysCfg, logger, Some(dir), Option.when(useStdin)(in)) + .flatMap(_.logOutputs(logger, name)) + .use { proc => + proc.waitFor().flatMap(rc => reader(out, rc).flatMap(handler.run)) + } } } .compile @@ -74,9 +64,9 @@ private[extern] object ExternConv { def readResult[F[_]: Async: Files]( chunkSize: Int, logger: Logger[F] - )(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] = + )(out: Path, result: Int): F[ConversionResult[F]] = File.existsNonEmpty[F](out).flatMap { - case true if result.rc == 0 => + case true if result == 0 => val outTxt = out.resolveSibling(out.fileName.toString + ".txt") File.existsNonEmpty[F](outTxt).flatMap { case true => @@ -88,13 +78,13 @@ private[extern] object ExternConv { successPdf(File.readAll(out, chunkSize)).pure[F] } case true => - logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *> + logger.warn(s"Command not successful (rc=${result}), but file exists.") *> successPdf(File.readAll(out, chunkSize)).pure[F] case false => ConversionResult .failure[F]( - new Exception(s"Command result=${result.rc}. No output file found.") + new Exception(s"Command result=${result}. No output file found.") ) .pure[F] } @@ -103,25 +93,25 @@ private[extern] object ExternConv { outPrefix: String, chunkSize: Int, logger: Logger[F] - )(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] = { + )(out: Path, result: Int): F[ConversionResult[F]] = { val outPdf = out.resolveSibling(s"$outPrefix.pdf") File.existsNonEmpty[F](outPdf).flatMap { case true => val outTxt = out.resolveSibling(s"$outPrefix.txt") File.exists(outTxt).flatMap { txtExists => val pdfData = File.readAll(out, chunkSize) - if (result.rc == 0) + if (result == 0) if (txtExists) successPdfTxt(pdfData, File.readText(outTxt)).pure[F] else successPdf(pdfData).pure[F] else - logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *> + logger.warn(s"Command not successful (rc=${result}), but file exists.") *> successPdf(pdfData).pure[F] } case false => ConversionResult .failure[F]( - new Exception(s"Command result=${result.rc}. No output file found.") + new Exception(s"Command result=${result}. No output file found.") ) .pure[F] } @@ -138,14 +128,6 @@ private[extern] object ExternConv { .drain ++ Stream.eval(storeFile(in, inFile)) - private def logResult[F[_]: Sync]( - name: String, - result: SystemCommand.Result, - logger: Logger[F] - ): F[Unit] = - logger.debug(s"$name stdout: ${result.stdout}") *> - logger.debug(s"$name stderr: ${result.stderr}") - private def storeFile[F[_]: Async: Files]( in: Stream[F, Byte], target: Path diff --git a/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala b/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala index 1150d913a8..9b350ba407 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala @@ -24,12 +24,14 @@ object OcrMyPdf { logger: Logger[F] )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = if (cfg.enabled) { - val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = + val reader: (Path, Int) => F[ConversionResult[F]] = ExternConv.readResult[F](chunkSize, logger) + val cmd = cfg.command.withVars(Map("lang" -> lang.iso3)) + ExternConv.toPDF[F, A]( "ocrmypdf", - cfg.command.replace(Map("{{lang}}" -> lang.iso3)), + cmd, cfg.workingDir, useStdin = false, logger, diff --git a/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala b/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala index 726469ce77..fa85b4ee6b 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala @@ -8,10 +8,10 @@ package docspell.convert.extern import fs2.io.file.Path -import docspell.common.SystemCommand +import docspell.common.exec.ExternalCommand case class OcrMyPdfConfig( enabled: Boolean, - command: SystemCommand.Config, + command: ExternalCommand, workingDir: Path ) diff --git a/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala b/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala index 39007c6c03..50ea2dd9e2 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala @@ -24,17 +24,18 @@ object Tesseract { logger: Logger[F] )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = { val outBase = cfg.command.args.tail.headOption.getOrElse("out") - val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = + val reader: (Path, Int) => F[ConversionResult[F]] = ExternConv.readResultTesseract[F](outBase, chunkSize, logger) + val cmd = cfg.command.withVars(Map("lang" -> lang.iso3)) + ExternConv.toPDF[F, A]( "tesseract", - cfg.command.replace(Map("{{lang}}" -> lang.iso3)), + cmd, cfg.workingDir, useStdin = false, logger, reader )(in, handler) } - } diff --git a/modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala b/modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala index d2f097806b..aae31fccaf 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala @@ -8,6 +8,6 @@ package docspell.convert.extern import fs2.io.file.Path -import docspell.common.SystemCommand +import docspell.common.exec.ExternalCommand -case class TesseractConfig(command: SystemCommand.Config, workingDir: Path) +case class TesseractConfig(command: ExternalCommand, workingDir: Path) diff --git a/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala b/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala index 01b9a4449e..102987d25b 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala @@ -10,7 +10,6 @@ import cats.effect._ import fs2.Stream import fs2.io.file.{Files, Path} -import docspell.common._ import docspell.convert.ConversionResult import docspell.convert.ConversionResult.Handler import docspell.logging.Logger @@ -22,12 +21,13 @@ object Unoconv { chunkSize: Int, logger: Logger[F] )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = { - val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = + val reader: (Path, Int) => F[ConversionResult[F]] = ExternConv.readResult[F](chunkSize, logger) + val cmd = cfg.command.withVars(Map.empty) ExternConv.toPDF[F, A]( "unoconv", - cfg.command, + cmd, cfg.workingDir, useStdin = false, logger, @@ -37,5 +37,4 @@ object Unoconv { handler ) } - } diff --git a/modules/convert/src/main/scala/docspell/convert/extern/UnoconvConfig.scala b/modules/convert/src/main/scala/docspell/convert/extern/UnoconvConfig.scala index 8fe0d209ac..140849663c 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/UnoconvConfig.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/UnoconvConfig.scala @@ -8,6 +8,6 @@ package docspell.convert.extern import fs2.io.file.Path -import docspell.common.SystemCommand +import docspell.common.exec.ExternalCommand -case class UnoconvConfig(command: SystemCommand.Config, workingDir: Path) +case class UnoconvConfig(command: ExternalCommand, workingDir: Path) diff --git a/modules/convert/src/main/scala/docspell/convert/extern/Weasyprint.scala b/modules/convert/src/main/scala/docspell/convert/extern/Weasyprint.scala index 2470d0fed4..5c411d3474 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/Weasyprint.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/Weasyprint.scala @@ -27,10 +27,10 @@ object Weasyprint { sanitizeHtml: SanitizeHtml, logger: Logger[F] )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = { - val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = + val reader: (Path, Int) => F[ConversionResult[F]] = ExternConv.readResult[F](chunkSize, logger) - val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name())) + val cmdCfg = cfg.command.withVars(Map("encoding" -> charset.name())) // html sanitize should (among other) remove links to invalid // protocols like cid: which is not supported by further @@ -51,5 +51,4 @@ object Weasyprint { handler ) } - } diff --git a/modules/convert/src/main/scala/docspell/convert/extern/WeasyprintConfig.scala b/modules/convert/src/main/scala/docspell/convert/extern/WeasyprintConfig.scala index 2ce485cc7c..dca425a83d 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/WeasyprintConfig.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/WeasyprintConfig.scala @@ -8,6 +8,6 @@ package docspell.convert.extern import fs2.io.file.Path -import docspell.common.SystemCommand +import docspell.common.exec.ExternalCommand -case class WeasyprintConfig(command: SystemCommand.Config, workingDir: Path) +case class WeasyprintConfig(command: ExternalCommand, workingDir: Path) diff --git a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala index 04e973fec6..f22d9ada6d 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala @@ -27,10 +27,10 @@ object WkHtmlPdf { sanitizeHtml: SanitizeHtml, logger: Logger[F] )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = { - val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = + val reader: (Path, Int) => F[ConversionResult[F]] = ExternConv.readResult[F](chunkSize, logger) - val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name())) + val cmdCfg = cfg.command.withVars(Map("encoding" -> charset.name())) // html sanitize should (among other) remove links to invalid // protocols like cid: which is not supported by further @@ -58,5 +58,4 @@ object WkHtmlPdf { handler ) } - } diff --git a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdfConfig.scala b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdfConfig.scala index 52b71ea337..293d15aa90 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdfConfig.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdfConfig.scala @@ -8,6 +8,6 @@ package docspell.convert.extern import fs2.io.file.Path -import docspell.common.SystemCommand +import docspell.common.exec.ExternalCommand -case class WkHtmlPdfConfig(command: SystemCommand.Config, workingDir: Path) +case class WkHtmlPdfConfig(command: ExternalCommand, workingDir: Path) diff --git a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala index 0cb2ca6574..cfbd08374e 100644 --- a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala +++ b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala @@ -15,6 +15,7 @@ import cats.implicits._ import fs2.Stream import docspell.common._ +import docspell.common.exec._ import docspell.common.util.File import docspell.convert.ConversionResult.Handler import docspell.convert.ConvertConfig.HtmlConverter @@ -36,7 +37,7 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig { 3000 * 3000, MarkdownConfig("body { padding: 2em 5em; }"), WkHtmlPdfConfig( - SystemCommand.Config( + ExternalCommand( "wkhtmltopdf", Seq("-s", "A4", "--encoding", "UTF-8", "-", "{{outfile}}"), Duration.seconds(20) @@ -44,7 +45,7 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig { target ), WeasyprintConfig( - SystemCommand.Config( + ExternalCommand( "weasyprint", Seq("--encoding", "UTF-8", "-", "{{outfile}}"), Duration.seconds(20) @@ -53,7 +54,7 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig { ), HtmlConverter.Wkhtmltopdf, TesseractConfig( - SystemCommand.Config( + ExternalCommand( "tesseract", Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"), Duration.seconds(20) @@ -61,7 +62,7 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig { target ), UnoconvConfig( - SystemCommand.Config( + ExternalCommand( "unoconv", Seq("-f", "pdf", "-o", "{{outfile}}", "{{infile}}"), Duration.seconds(20) @@ -70,7 +71,7 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig { ), OcrMyPdfConfig( enabled = true, - SystemCommand.Config( + ExternalCommand( "ocrmypdf", Seq( "-l", diff --git a/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala index 9beaed28c6..67d1e5ae76 100644 --- a/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala +++ b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala @@ -14,6 +14,7 @@ import cats.effect.unsafe.implicits.global import fs2.io.file.Path import docspell.common._ +import docspell.common.exec._ import docspell.common.util.File import docspell.convert._ import docspell.files.ExampleFiles @@ -27,7 +28,7 @@ class ExternConvTest extends FunSuite with FileChecks with TestLoggingConfig { val target = File.path(Paths.get("target")) test("convert html to pdf") { - val cfg = SystemCommand.Config( + val cfg = ExternalCommand( "wkhtmltopdf", Seq("-s", "A4", "--encoding", "UTF-8", "-", "{{outfile}}"), Duration.seconds(20) @@ -53,7 +54,7 @@ class ExternConvTest extends FunSuite with FileChecks with TestLoggingConfig { } test("convert office to pdf") { - val cfg = SystemCommand.Config( + val cfg = ExternalCommand( "unoconv", Seq("-f", "pdf", "-o", "{{outfile}}", "{{infile}}"), Duration.seconds(20) @@ -80,7 +81,7 @@ class ExternConvTest extends FunSuite with FileChecks with TestLoggingConfig { } test("convert image to pdf") { - val cfg = SystemCommand.Config( + val cfg = ExternalCommand( "tesseract", Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"), Duration.seconds(20) @@ -105,5 +106,4 @@ class ExternConvTest extends FunSuite with FileChecks with TestLoggingConfig { ) .unsafeRunSync() } - } diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala index b082820100..f70c46b612 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala @@ -10,7 +10,8 @@ import cats.effect._ import fs2.Stream import fs2.io.file.{Files, Path} -import docspell.common._ +import docspell.common.exec.ExternalCommand +import docspell.common.exec.SysExec import docspell.common.util.File import docspell.logging.Logger @@ -77,14 +78,17 @@ object Ocr { else cfg.ghostscript.command.args val cmd = cfg.ghostscript.command .copy(args = xargs) - .replace( + .withVars( Map( - "{{infile}}" -> "-", - "{{outfile}}" -> "%d.tif" + "infile" -> "-", + "outfile" -> "%d.tif" ) ) - SystemCommand - .execSuccess(cmd, logger, wd = Some(wd), stdin = pdf) + .resolved + + Stream + .resource(SysExec(cmd, logger, Some(wd), Some(pdf))) + .evalMap(_.runToSuccess(logger)) .flatMap(_ => File.listFiles(pathEndsWith(".tif"), wd)) } @@ -93,18 +97,22 @@ object Ocr { */ private[extract] def runGhostscriptFile[F[_]: Async: Files]( pdf: Path, - ghostscript: SystemCommand.Config, + ghostscript: ExternalCommand, wd: Path, logger: Logger[F] ): Stream[F, Path] = { - val cmd = ghostscript.replace( - Map( - "{{infile}}" -> pdf.absolute.toString, - "{{outfile}}" -> "%d.tif" + val cmd = ghostscript + .withVars( + Map( + "infile" -> pdf.absolute.toString, + "outfile" -> "%d.tif" + ) ) - ) - SystemCommand - .execSuccess[F](cmd, logger, wd = Some(wd)) + .resolved + + Stream + .resource(SysExec(cmd, logger, Some(wd))) + .evalMap(_.runToSuccess(logger)) .flatMap(_ => File.listFiles(pathEndsWith(".tif"), wd)) } @@ -116,19 +124,23 @@ object Ocr { */ private[extract] def runUnpaperFile[F[_]: Async]( img: Path, - unpaper: SystemCommand.Config, + unpaper: ExternalCommand, wd: Option[Path], logger: Logger[F] ): Stream[F, Path] = { val targetFile = img.resolveSibling("u-" + img.fileName.toString).absolute - val cmd = unpaper.replace( - Map( - "{{infile}}" -> img.absolute.toString, - "{{outfile}}" -> targetFile.toString + val cmd = unpaper + .withVars( + Map( + "infile" -> img.absolute.toString, + "outfile" -> targetFile.toString + ) ) - ) - SystemCommand - .execSuccess[F](cmd, logger, wd = wd) + .resolved + + Stream + .resource(SysExec(cmd, logger, wd)) + .evalMap(_.runToSuccess(logger)) .map(_ => targetFile) .handleErrorWith { th => logger @@ -150,12 +162,14 @@ object Ocr { // so use the parent as working dir runUnpaperFile(img, config.unpaper.command, img.parent, logger).flatMap { uimg => val cmd = config.tesseract.command - .replace( - Map("{{file}}" -> uimg.fileName.toString, "{{lang}}" -> fixLanguage(lang)) + .withVars( + Map("file" -> uimg.fileName.toString, "lang" -> fixLanguage(lang)) ) - SystemCommand - .execSuccess[F](cmd, logger, wd = uimg.parent) - .map(_.stdout) + .resolved + + Stream + .resource(SysExec(cmd, logger, uimg.parent)) + .evalMap(_.runToSuccessStdout(logger)) } /** Run tesseract on the given image file and return the extracted text. */ @@ -166,8 +180,12 @@ object Ocr { config: OcrConfig ): Stream[F, String] = { val cmd = config.tesseract.command - .replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang))) - SystemCommand.execSuccess(cmd, logger, stdin = img).map(_.stdout) + .withVars(Map("file" -> "stdin", "lang" -> fixLanguage(lang))) + .resolved + + Stream + .resource(SysExec(cmd, logger, None, Some(img))) + .evalMap(_.runToSuccessStdout(logger)) } private def fixLanguage(lang: String): String = diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala b/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala index 856c21a308..6170f62bee 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala @@ -6,12 +6,9 @@ package docspell.extract.ocr -import java.nio.file.Paths - import fs2.io.file.Path -import docspell.common._ -import docspell.common.util.File +import docspell.common.exec.ExternalCommand case class OcrConfig( maxImageSize: Int, @@ -25,43 +22,10 @@ object OcrConfig { case class PageRange(begin: Int) - case class Ghostscript(command: SystemCommand.Config, workingDir: Path) + case class Ghostscript(command: ExternalCommand, workingDir: Path) - case class Tesseract(command: SystemCommand.Config) + case class Tesseract(command: ExternalCommand) - case class Unpaper(command: SystemCommand.Config) + case class Unpaper(command: ExternalCommand) - val default = OcrConfig( - maxImageSize = 3000 * 3000, - pageRange = PageRange(10), - ghostscript = Ghostscript( - SystemCommand.Config( - "gs", - Seq( - "-dNOPAUSE", - "-dBATCH", - "-dSAFER", - "-sDEVICE=tiffscaled8", - "-sOutputFile={{outfile}}", - "{{infile}}" - ), - Duration.seconds(30) - ), - File.path( - Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction") - ) - ), - unpaper = Unpaper( - SystemCommand - .Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30)) - ), - tesseract = Tesseract( - SystemCommand - .Config( - "tesseract", - Seq("{{file}}", "stdout", "-l", "{{lang}}"), - Duration.minutes(1) - ) - ) - ) } diff --git a/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala index 71d55ad81c..7aa6a07277 100644 --- a/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala +++ b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala @@ -6,9 +6,14 @@ package docspell.extract.ocr +import java.nio.file.Paths + import cats.effect.IO import cats.effect.unsafe.implicits.global +import docspell.common.Duration +import docspell.common.exec.ExternalCommand +import docspell.common.util.File import docspell.files.TestFiles import docspell.logging.TestLoggingConfig @@ -21,7 +26,7 @@ class TextExtractionSuite extends FunSuite with TestLoggingConfig { test("extract english pdf".ignore) { val text = TextExtract - .extract[IO](letterSourceEN, logger, "eng", OcrConfig.default) + .extract[IO](letterSourceEN, logger, "eng", TextExtractionSuite.defaultConfig) .compile .lastOrError .unsafeRunSync() @@ -31,7 +36,7 @@ class TextExtractionSuite extends FunSuite with TestLoggingConfig { test("extract german pdf".ignore) { val expect = TestFiles.letterDEText val extract = TextExtract - .extract[IO](letterSourceDE, logger, "deu", OcrConfig.default) + .extract[IO](letterSourceDE, logger, "deu", TextExtractionSuite.defaultConfig) .compile .lastOrError .unsafeRunSync() @@ -39,3 +44,37 @@ class TextExtractionSuite extends FunSuite with TestLoggingConfig { assertEquals(extract.value, expect) } } + +object TextExtractionSuite { + val defaultConfig = OcrConfig( + maxImageSize = 3000 * 3000, + pageRange = OcrConfig.PageRange(10), + ghostscript = OcrConfig.Ghostscript( + ExternalCommand( + "gs", + Seq( + "-dNOPAUSE", + "-dBATCH", + "-dSAFER", + "-sDEVICE=tiffscaled8", + "-sOutputFile={{outfile}}", + "{{infile}}" + ), + Duration.seconds(30) + ), + File.path( + Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction") + ) + ), + unpaper = OcrConfig.Unpaper( + ExternalCommand("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30)) + ), + tesseract = OcrConfig.Tesseract( + ExternalCommand( + "tesseract", + Seq("{{file}}", "stdout", "-l", "{{lang}}"), + Duration.minutes(1) + ) + ) + ) +}