From d760b8d16b0e2fd944ae2348f7931377c4ff399c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20G=C3=BCthler?= Date: Mon, 14 Aug 2023 14:34:17 +0200 Subject: [PATCH] Add option to set `fetcherName` for Tika >= 2.0.0 In Tika >= 2.0.0, fetching remote files via the server is done using so-called [fetchers](https://cwiki.apache.org/confluence/display/TIKA/tika-pipes). If you are running a Tika Server that is configured to use an HTTP fetcher, you need the client to tell the server which fetcher to use, which is done by adding the HTTP header `fetcherName` to the request. Furthermore, the URL of the remote file to be fetched must be passed using a `fetchKey` header instead of `fileUrl` as in Tika 1.x.x. This adds a public API method to set the fetcher name, and replaces the `fileUrl` header with `fetcherName` and `fetchKey` if a fetcher name is set. If no fetcher name is set, the `fileUrl` header is still added to the request as usual to keep Tika 1.x.x compatibility. --- README.md | 11 +++++++++++ src/Clients/WebClient.php | 24 +++++++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 117157c..aae3c58 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,11 @@ You can use an URL instead of a file path and the library will download the file **no need** to add `-enableUnsecureFeatures -enableFileUrl` to command line when starting the server, as described [here](https://wiki.apache.org/tika/TikaJAXRS#Specifying_a_URL_Instead_of_Putting_Bytes). +If you use Apache Tika >= 2.0.0, you *can* [define an HttpFetcher](https://cwiki.apache.org/confluence/display/TIKA/tika-pipes) +and use the option `-enableUnsecureFeatures -enableFileUrl` when starting the server to make the server download remote +files when passing a URL instead of a filename to `$client->getText()`. In order to do so, you must set the name of +the HttpFetcher using `$client->setFetcherName('yourFetcherName')`. 
+ ### Methods Here are the full list of available methods @@ -254,6 +259,12 @@ $client->setOCRLanguages($languages); $client->getOCRLanguages(); ``` +Set HTTP fetcher name (for Tika >= 2.0.0 only, see https://cwiki.apache.org/confluence/display/TIKA/tika-pipes) + +```php +$client->setFetcherName($fetcherName); +``` + ### Breaking changes Since 1.0 version there are some breaking changes: diff --git a/src/Clients/WebClient.php b/src/Clients/WebClient.php index 322d506..6958aea 100644 --- a/src/Clients/WebClient.php +++ b/src/Clients/WebClient.php @@ -51,6 +51,13 @@ class WebClient extends Client */ protected $retries = 3; + /** + * Name of the fetcher to be used (for Tika >= 2.0.0 only) + * + * @var string|null + */ + protected $fetcherName = null; + /** * Default cURL options * @@ -208,6 +215,16 @@ public function setRetries(int $retries): self return $this; } + /** + * Set the name of the fetcher to be used (for Tika >= 2.0.0 only) + */ + public function setFetcherName(string $fetcherName): self + { + $this->fetcherName = $fetcherName; + + return $this; + } + /** * Get all the options */ @@ -626,7 +643,12 @@ protected function getParameters(string $type, string $file = null): array if(!empty($file) && preg_match('/^http/', $file)) { - $headers[] = "fileUrl:$file"; + if($this->fetcherName) { + $headers[] = "fetcherName:$this->fetcherName"; + $headers[] = "fetchKey:$file"; + } else { + $headers[] = "fileUrl:$file"; + } } switch($type)