diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
new file mode 100644
index 0000000..3dbefd7
--- /dev/null
+++ b/.github/workflows/maven.yml
@@ -0,0 +1,35 @@
+# This workflow will build a Java project with Maven
+# For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven
+
+name: Java CI with Maven
+
+on:
+  push:
+    branches:
+    - main
+    - "*.x"
+  pull_request:
+    branches:
+    - main
+    - "*.x"
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4 # v2 runs on a retired Node.js runtime
+    - name: Set up JDK 17
+      uses: actions/setup-java@v4
+      with:
+        java-version: '17'
+        distribution: 'temurin'
+    - uses: actions/cache@v4 # older cache action versions are retired; see actions/cache release notes
+      with:
+        path: ~/.m2/repository
+        key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
+        restore-keys: |
+          ${{ runner.os }}-maven-
+    - name: Build with Maven
+      run: mvn -B source:jar javadoc:jar package --file pom.xml
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fb17dd9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+/target/
+/work/
+/bin/
+/.settings/
+.project
+.classpath
+*.iml
+.idea
+.DS_Store
+dependency-reduced-pom.xml
diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity.
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..4b08490 --- /dev/null +++ b/README.md @@ -0,0 +1,23 @@ +# Fess for Multimodal Search +[![Java CI with Maven](https://github.com/codelibs/fess-webapp-multimodal/actions/workflows/maven.yml/badge.svg)](https://github.com/codelibs/fess-webapp-multimodal/actions/workflows/maven.yml) + +## Overview + +This is a multimodal-search plugin for Fess, enabling the crawling and indexing of various media formats such as text, images, audio, and video. + +## Download + +See [Maven Repository](https://repo1.maven.org/maven2/org/codelibs/fess/fess-webapp-multimodal/). + +## Installation + +See [Plugin](https://fess.codelibs.org/14.15/admin/plugin-guide.html) of Administration guide. + +## Contributing + +We welcome contributions to enhance the functionality of this plugin. Please fork the repository and submit pull requests. + +## License + +This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. 
+ diff --git a/docker/cache/clip/.keep b/docker/cache/clip/.keep new file mode 100644 index 0000000..e69de29 diff --git a/docker/cache/jina/.keep b/docker/cache/jina/.keep new file mode 100644 index 0000000..e69de29 diff --git a/docker/clip_config.yaml b/docker/clip_config.yaml new file mode 100644 index 0000000..80b7af5 --- /dev/null +++ b/docker/clip_config.yaml @@ -0,0 +1,14 @@ +jtype: Flow +version: '1' +with: + port: 51000 + protocol: http + cors: true +executors: + - name: clip_t + uses: + jtype: CLIPEncoder + metas: + py_modules: + - clip_server.executors.clip_torch + diff --git a/docker/compose.yaml b/docker/compose.yaml new file mode 100644 index 0000000..bdbfb06 --- /dev/null +++ b/docker/compose.yaml @@ -0,0 +1,20 @@ +services: + clip_server: + container_name: clip_server + image: jinaai/clip-server + ports: + - "51000:51000" + volumes: + - ./cache:/home/cas/.cache + - ./clip_config.yaml:/home/cas/clip_config.yaml + environment: + - JINA_HIDE_SURVEY=1 + - JINA_LOG_LEVEL=DEBUG + deploy: + resources: + reservations: + devices: + - capabilities: [gpu] + command: ["/home/cas/clip_config.yaml"] + + diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..a215a14 --- /dev/null +++ b/pom.xml @@ -0,0 +1,122 @@ + + + 4.0.0 + fess-webapp-multimodal + 14.15.0-SNAPSHOT + jar + fess-webapp-multimodal + + scm:git:git@github.com:codelibs/fess-webapp-multimodal.git + + scm:git:git@github.com:codelibs/fess-webapp-multimodal.git + https://github.com/codelibs/fess-webapp-multimodal + + + org.codelibs.fess + fess-parent + 14.15.0-SNAPSHOT + + + + UTF-8 + + + + + maven-compiler-plugin + + + maven-source-plugin + + + net.revelc.code.formatter + formatter-maven-plugin + + + maven-javadoc-plugin + + + maven-jar-plugin + + + + + org.codelibs.fess.crawler.multimodal + + + + + + com.mycila + license-maven-plugin + + + maven-surefire-plugin + + + org.jacoco + jacoco-maven-plugin + + + + + + snapshots.oss.sonatype.org + 
https://oss.sonatype.org/content/repositories/snapshots + + false + + + true + + + + + + org.codelibs + corelib + ${corelib.version} + + + org.codelibs + curl4j + ${curl4j.version} + + + org.codelibs.fess + fess + ${fess.version} + + + org.codelibs.fess + fess-crawler + ${crawler.version} + + + org.opensearch + opensearch + ${opensearch.version} + provided + + + org.apache.logging.log4j + log4j-api + ${log4j.version} + provided + + + junit + junit + ${junit.version} + test + + + org.dbflute.utflute + utflute-core + ${utflute.version} + test + + +
diff --git a/src/main/java/org/codelibs/fess/MultiModalConstants.java b/src/main/java/org/codelibs/fess/MultiModalConstants.java
new file mode 100644
index 0000000..d0ff788
--- /dev/null
+++ b/src/main/java/org/codelibs/fess/MultiModalConstants.java
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2012-2024 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess;
+
+/**
+ * Constants shared by the multimodal plugin components.
+ */
+public class MultiModalConstants {
+
+    private MultiModalConstants() {
+        // constants holder; never instantiated
+    }
+
+    /**
+     * Metadata key under which {@code CasExtractor} stores the encoded image embedding
+     * and which {@code EmbeddingIngester} registers as a crawler metadata name mapping.
+     */
+    public static final String X_FESS_EMBEDDING = "X-FESS-Embedding";
+}
diff --git a/src/main/java/org/codelibs/fess/client/CasClient.java b/src/main/java/org/codelibs/fess/client/CasClient.java
new file mode 100644
index 0000000..af3c669
--- /dev/null
+++ b/src/main/java/org/codelibs/fess/client/CasClient.java
@@ -0,0 +1,156 @@
+/*
+ * Copyright 2012-2024 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.client;
+
+import java.awt.Graphics2D;
+import java.awt.Image;
+import java.awt.Rectangle;
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Base64;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+
+import javax.annotation.PostConstruct;
+import javax.imageio.ImageIO;
+import javax.imageio.ImageReadParam;
+import javax.imageio.ImageReader;
+import javax.imageio.stream.ImageInputStream;
+
+import org.apache.commons.text.StringEscapeUtils;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.codelibs.curl.Curl;
+import org.codelibs.curl.CurlException;
+import org.codelibs.curl.CurlResponse;
+import org.codelibs.fess.exception.CasAccessException;
+import org.opensearch.common.xcontent.LoggingDeprecationHandler;
+import org.opensearch.common.xcontent.json.JsonXContent;
+import org.opensearch.core.xcontent.NamedXContentRegistry;
+
+/**
+ * Client that resizes an image, posts it as Base64 JSON to a CLIP server over HTTP
+ * and returns the embedding found in the response.
+ */
+public class CasClient {
+    private static final Logger logger = LogManager.getLogger(CasClient.class);
+
+    /**
+     * Parses a response body as JSON into a map. Any failure while reading or parsing
+     * is rethrown as a {@link CurlException}.
+     */
+    protected static final Function<CurlResponse, Map<String, Object>> PARSER = response -> {
+        // The parser is closed together with the stream instead of being left open.
+        try (InputStream is = response.getContentAsStream();
+                var parser = JsonXContent.jsonXContent.createParser(NamedXContentRegistry.EMPTY,
+                        LoggingDeprecationHandler.INSTANCE, is)) {
+            return parser.map();
+        } catch (final Exception e) {
+            throw new CurlException("Failed to access the content.", e);
+        }
+    };
+
+    /** Width of the image sent to the server ({@code clip.image.width}, default 224). */
+    protected int imageWidth;
+
+    /** Height of the image sent to the server ({@code clip.image.height}, default 224). */
+    protected int imageHeight;
+
+    /** Largest accepted source width ({@code clip.image.max_width}, default 1000). */
+    protected int maxImageWidth;
+
+    /** Largest accepted source height ({@code clip.image.max_height}, default 1000). */
+    protected int maxImageHeight;
+
+    /** ImageIO format name used to encode the resized image ({@code clip.image.format}, default png). */
+    protected String imageFormat;
+
+    /** Base URL of the CLIP server ({@code clip.server.endpoint}, default http://localhost:51000). */
+    protected String clipEndpoint;
+
+    /**
+     * Loads the settings from system properties, falling back to the defaults
+     * documented on each field.
+     */
+    @PostConstruct
+    public void init() {
+        imageWidth = Integer.getInteger("clip.image.width", 224);
+        imageHeight = Integer.getInteger("clip.image.height", 224);
+        maxImageWidth = Integer.getInteger("clip.image.max_width", 1000);
+        maxImageHeight = Integer.getInteger("clip.image.max_height", 1000);
+        imageFormat = System.getProperty("clip.image.format", "png");
+        clipEndpoint = System.getProperty("clip.server.endpoint", "http://localhost:51000");
+
+        logger.debug("image: {}x{}, max: {}x{}, format: {}, endpoint: {}", imageWidth, imageHeight, maxImageWidth, maxImageHeight,
+                imageFormat, clipEndpoint);
+    }
+
+    /**
+     * Encodes the image read from {@code in} and asks the server for its embedding.
+     *
+     * @param in stream positioned at the start of the image data
+     * @return the embedding returned by the server
+     * @throws CasAccessException if the image cannot be read or no embedding is returned
+     */
+    public float[] getImageEmbedding(final InputStream in) {
+        return sendImage(encodeImage(in));
+    }
+
+    /**
+     * Posts the Base64 image to {@code clipEndpoint + "/post"} and extracts
+     * {@code data[0].embedding} from the JSON response.
+     *
+     * @param encodedImage Base64 text of the encoded image
+     * @return the embedding values as floats
+     * @throws CasAccessException if the response has no usable embedding or closing the response fails
+     */
+    protected float[] sendImage(final String encodedImage) {
+        final String body = "{\"data\":[{\"blob\":\"" + StringEscapeUtils.escapeJson(encodedImage) + "\"}],\"execEndpoint\":\"/\"}";
+        logger.debug("request body: {}", body);
+        try (CurlResponse response = Curl.post(clipEndpoint + "/post").header("Content-Type", "application/json").body(body).execute()) {
+            final Map<String, Object> contentMap = response.getContent(PARSER);
+            if (contentMap.get("data") instanceof final List<?> dataList && !dataList.isEmpty()
+                    && dataList.get(0) instanceof final Map<?, ?> data
+                    && data.get("embedding") instanceof final List<?> embeddingList) {
+                logger.debug("embedding: {}", embeddingList);
+                final float[] embedding = new float[embeddingList.size()];
+                for (int i = 0; i < embedding.length; i++) {
+                    // Report a malformed response as CasAccessException instead of a ClassCastException.
+                    if (!(embeddingList.get(i) instanceof final Number value)) {
+                        throw new CasAccessException("Clip server returned a non-numeric embedding value at index " + i);
+                    }
+                    embedding[i] = value.floatValue();
+                }
+                return embedding;
+            }
+        } catch (final IOException e) {
+            throw new CasAccessException("Clip server failed to generate an embedding.", e);
+        }
+        throw new CasAccessException("Clip server cannot generate an embedding");
+    }
+
+    /**
+     * Reads the first image from {@code in}, scales it to fit {@code imageWidth} x {@code imageHeight}
+     * while keeping the aspect ratio, centers it on a canvas of that size, encodes the canvas in
+     * {@code imageFormat} and returns the bytes as Base64.
+     *
+     * @param in stream containing the image
+     * @return Base64 text of the resized image
+     * @throws CasAccessException if no reader accepts the data, the size is outside the allowed range,
+     *         no writer exists for {@code imageFormat}, or an I/O error occurs
+     */
+    protected String encodeImage(final InputStream in) {
+        try (ImageInputStream input = ImageIO.createImageInputStream(in)) {
+            final Iterator<ImageReader> readers = ImageIO.getImageReaders(input);
+            if (!readers.hasNext()) {
+                throw new CasAccessException("No image.");
+            }
+            final ImageReader reader = readers.next();
+            try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
+                reader.setInput(input);
+                final int width = reader.getWidth(0);
+                final int height = reader.getHeight(0);
+                if (width <= 0 || height <= 0 || width > maxImageWidth || height > maxImageHeight) {
+                    throw new CasAccessException("Invalid image size: " + width + "x" + height);
+                }
+
+                final float aspectRatio = (float) width / height;
+                int newWidth = imageWidth;
+                int newHeight = imageHeight;
+                // Clamp to 1 pixel: a very wide or very tall source would otherwise truncate to 0
+                // and cause a division by zero below.
+                if (aspectRatio > 1) {
+                    newHeight = Math.max(1, (int) (imageWidth / aspectRatio));
+                } else {
+                    newWidth = Math.max(1, (int) (imageHeight * aspectRatio));
+                }
+
+                // Subsample while decoding so the full-size image is not held in memory.
+                final ImageReadParam param = reader.getDefaultReadParam();
+                param.setSourceSubsampling(Math.max(1, width / newWidth), Math.max(1, height / newHeight), 0, 0);
+                param.setSourceRegion(new Rectangle(width, height));
+
+                final BufferedImage image = reader.read(0, param);
+                try {
+                    // TYPE_CUSTOM is rejected by the BufferedImage constructor; fall back to RGB.
+                    final int type = image.getType() == BufferedImage.TYPE_CUSTOM ? BufferedImage.TYPE_INT_RGB : image.getType();
+                    final BufferedImage clipImage = new BufferedImage(imageWidth, imageHeight, type);
+                    final int x = (imageWidth - newWidth) / 2;
+                    final int y = (imageHeight - newHeight) / 2;
+                    final Graphics2D graphics = clipImage.createGraphics();
+                    try {
+                        graphics.drawImage(image.getScaledInstance(newWidth, newHeight, Image.SCALE_AREA_AVERAGING), x, y, newWidth,
+                                newHeight, null);
+                    } finally {
+                        graphics.dispose();
+                    }
+                    // ImageIO.write returns false (without throwing) when no writer handles the format.
+                    if (!ImageIO.write(clipImage, imageFormat, out)) {
+                        throw new CasAccessException("No image writer for format: " + imageFormat);
+                    }
+                } finally {
+                    image.flush();
+                }
+                return Base64.getEncoder().encodeToString(out.toByteArray());
+            } finally {
+                reader.dispose();
+            }
+        } catch (final CasAccessException e) {
+            throw e;
+        } catch (final IOException e) {
+            throw new CasAccessException("Failed to read an image.", e);
+        }
+    }
+}
diff --git a/src/main/java/org/codelibs/fess/crawler/extractor/CasExtractor.java b/src/main/java/org/codelibs/fess/crawler/extractor/CasExtractor.java
new file mode 100644
index 0000000..0f9c445
--- /dev/null
+++ b/src/main/java/org/codelibs/fess/crawler/extractor/CasExtractor.java
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2012-2024 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.crawler.extractor;
+
+import java.io.InputStream;
+import java.util.Map;
+
+import javax.annotation.PostConstruct;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.codelibs.fess.MultiModalConstants;
+import org.codelibs.fess.client.CasClient;
+import org.codelibs.fess.crawler.entity.ExtractData;
+import org.codelibs.fess.crawler.extractor.impl.TikaExtractor;
+import org.codelibs.fess.ingest.EmbeddingIngester;
+import org.codelibs.fess.util.EmbeddingUtil;
+
+/**
+ * Extractor that runs the inherited {@link TikaExtractor} extraction and additionally asks
+ * {@link CasClient} for an image embedding, storing it in the extracted data under
+ * {@link MultiModalConstants#X_FESS_EMBEDDING}.
+ */
+public class CasExtractor extends TikaExtractor {
+
+    // Named after this class so log records are attributed to the extractor, not the ingester.
+    private static final Logger logger = LogManager.getLogger(CasExtractor.class);
+
+    /** Client used to obtain embeddings; looked up from the crawler container in {@link #init()}. */
+    protected CasClient client;
+
+    /**
+     * @return the fixed weight {@code 10}
+     */
+    @Override
+    public int getWeight() {
+        return 10;
+    }
+
+    /**
+     * Runs the parent initialization and then resolves the {@code casClient} component.
+     */
+    @Override
+    @PostConstruct
+    public void init() {
+        super.init();
+
+        client = crawlerContainer.getComponent("casClient");
+    }
+
+    /**
+     * Delegates to the inherited three-argument {@code getText}, passing a callback that stores the
+     * Base64-encoded embedding of the image in the result. A failure to obtain the embedding is
+     * logged as a warning and does not abort extraction.
+     *
+     * @param inputStream the content to extract
+     * @param params extraction parameters, passed through unchanged
+     * @return the extracted data, with the embedding value added when it could be computed
+     */
+    @Override
+    public ExtractData getText(final InputStream inputStream, final Map<String, String> params) {
+        // NOTE(review): the callback presumably receives the extracted data and a stream over the
+        // same content - confirm against TikaExtractor.
+        return getText(inputStream, params, (data, in) -> {
+            try {
+                data.putValue(MultiModalConstants.X_FESS_EMBEDDING, EmbeddingUtil.encodeFloatArray(client.getImageEmbedding(in)));
+            } catch (final Exception e) {
+                logger.warn("Failed to convert an image to a vector.", e);
+            }
+        });
+    }
+
+}
diff --git a/src/main/java/org/codelibs/fess/exception/CasAccessException.java b/src/main/java/org/codelibs/fess/exception/CasAccessException.java
new file mode 100644
index 0000000..897d33a
--- /dev/null
+++ b/src/main/java/org/codelibs/fess/exception/CasAccessException.java
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2012-2024 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.exception;
+
+import org.codelibs.fess.crawler.exception.CrawlerSystemException;
+
+/**
+ * Thrown when an image cannot be prepared for, or an embedding cannot be obtained from,
+ * the CLIP server.
+ */
+public class CasAccessException extends CrawlerSystemException {
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * @param message description of the failure
+     * @param cause the underlying exception
+     */
+    public CasAccessException(final String message, final Throwable cause) {
+        super(message, cause);
+    }
+
+    /**
+     * @param message description of the failure
+     */
+    public CasAccessException(final String message) {
+        super(message);
+    }
+}
diff --git a/src/main/java/org/codelibs/fess/ingest/EmbeddingIngester.java b/src/main/java/org/codelibs/fess/ingest/EmbeddingIngester.java
new file mode 100644
index 0000000..63f78ca
--- /dev/null
+++ b/src/main/java/org/codelibs/fess/ingest/EmbeddingIngester.java
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2012-2024 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.ingest;
+
+import java.util.Map;
+
+import javax.annotation.PostConstruct;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.codelibs.core.lang.StringUtil;
+import org.codelibs.fess.Constants;
+import org.codelibs.fess.MultiModalConstants;
+import org.codelibs.fess.util.ComponentUtil;
+import org.codelibs.fess.util.EmbeddingUtil;
+
+/**
+ * Ingester that replaces the Base64-encoded embedding stored in a document map
+ * with its decoded {@code float[]} form.
+ */
+public class EmbeddingIngester extends Ingester {
+    private static final Logger logger = LogManager.getLogger(EmbeddingIngester.class);
+
+    /** Name of the document field that holds the embedding. */
+    protected String embeddingField;
+
+    /**
+     * Reads the target field name from the {@code clip.index.embedding_field} system property
+     * (default {@code content_vector}) and registers a crawler metadata mapping from
+     * {@link MultiModalConstants#X_FESS_EMBEDDING} to that field as an array type.
+     */
+    @PostConstruct
+    public void init() {
+        embeddingField = System.getProperty("clip.index.embedding_field", "content_vector");
+
+        ComponentUtil.getFessConfig().addCrawlerMetadataNameMapping(MultiModalConstants.X_FESS_EMBEDDING, embeddingField,
+                Constants.MAPPING_TYPE_ARRAY, StringUtil.EMPTY);
+    }
+
+    /**
+     * Replaces the encoded embedding in {@code target} with its decoded {@code float[]} form.
+     * Only the first element of the string array is decoded. A value that is not a
+     * {@code String[]}, or an empty array, is logged and left untouched.
+     *
+     * @param target the document field map; modified in place
+     * @return the same {@code target} instance
+     */
+    @Override
+    protected Map<String, Object> process(final Map<String, Object> target) {
+        if (!target.containsKey(embeddingField)) {
+            return target;
+        }
+        logger.debug("[{}] : {}", embeddingField, target);
+        if (target.get(embeddingField) instanceof final String[] encodedEmbeddings) {
+            if (encodedEmbeddings.length == 0) {
+                // Indexing [0] on an empty array would throw ArrayIndexOutOfBoundsException.
+                logger.warn("{} is an empty array.", embeddingField);
+            } else {
+                final float[] embedding = EmbeddingUtil.decodeFloatArray(encodedEmbeddings[0]);
+                logger.debug("embedding:{}", embedding);
+                target.put(embeddingField, embedding);
+            }
+        } else {
+            logger.warn("{} is not an array.", embeddingField);
+        }
+        return target;
+    }
+}
diff --git a/src/main/java/org/codelibs/fess/util/EmbeddingUtil.java b/src/main/java/org/codelibs/fess/util/EmbeddingUtil.java
new file mode 100644
index 0000000..f4e1c12
--- /dev/null
+++ b/src/main/java/org/codelibs/fess/util/EmbeddingUtil.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2012-2024 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.util;
+
+import java.nio.ByteBuffer;
+import java.util.Base64;
+
+/**
+ * Converts float arrays to and from Base64 text. Each float occupies four bytes in the
+ * {@link ByteBuffer} default (big-endian) byte order.
+ */
+public class EmbeddingUtil {
+
+    private EmbeddingUtil() {
+        // nothing
+    }
+
+    /**
+     * Serializes the floats in order and Base64-encodes the resulting bytes.
+     *
+     * @param floatArray values to encode
+     * @return Base64 text; empty for an empty array
+     */
+    public static String encodeFloatArray(final float[] floatArray) {
+        final ByteBuffer packed = ByteBuffer.allocate(floatArray.length * Float.BYTES);
+        // The float view writes straight into the backing array returned by array().
+        packed.asFloatBuffer().put(floatArray);
+        return Base64.getEncoder().encodeToString(packed.array());
+    }
+
+    /**
+     * Reverses {@link #encodeFloatArray(float[])}. Trailing bytes that do not form a
+     * complete four-byte float are ignored.
+     *
+     * @param encodedString Base64 text
+     * @return the decoded floats
+     */
+    public static float[] decodeFloatArray(final String encodedString) {
+        final byte[] raw = Base64.getDecoder().decode(encodedString);
+        final float[] decoded = new float[raw.length / Float.BYTES];
+        ByteBuffer.wrap(raw).asFloatBuffer().get(decoded);
+        return decoded;
+    }
+
+}
diff --git a/src/main/resources/app++.xml b/src/main/resources/app++.xml new file mode 100644 index 0000000..ac3da29 --- /dev/null +++ b/src/main/resources/app++.xml @@ -0,0 +1,8 @@ + + + + + + diff --git a/src/main/resources/crawler/extractor++.xml b/src/main/resources/crawler/extractor++.xml new file mode 100644 index 0000000..61f0228 --- /dev/null +++ b/src/main/resources/crawler/extractor++.xml @@ -0,0 +1,18 @@ + + + + + + + + [ + "image/gif", + "image/jpeg", + "image/png", + ] + + + + diff --git a/src/main/resources/fess_ingest++.xml b/src/main/resources/fess_ingest++.xml new file mode 100644 index 0000000..34fbca0 --- /dev/null +++ b/src/main/resources/fess_ingest++.xml @@ -0,0 +1,8 @@ + + + + + + +
diff --git a/src/test/java/org/codelibs/fess/client/CasClientTest.java b/src/test/java/org/codelibs/fess/client/CasClientTest.java
new file mode 100644
index 0000000..742110c
--- /dev/null
+++ b/src/test/java/org/codelibs/fess/client/CasClientTest.java
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2012-2024 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.client;
+
+import java.io.InputStream;
+import java.util.logging.Logger;
+
+import org.codelibs.core.io.ResourceUtil;
+import org.codelibs.curl.CurlException;
+import org.codelibs.fess.crawler.extractor.CasExtractorTest;
+import org.dbflute.utflute.core.PlainTestCase;
+
+/**
+ * Tests for {@link CasClient}.
+ */
+public class CasClientTest extends PlainTestCase {
+    // Named after this test class so its output is attributed correctly.
+    static final Logger logger = Logger.getLogger(CasClientTest.class.getName());
+
+    /** Checks the length of the Base64 text produced for the sample image with default settings. */
+    public void test_encodeImage() throws Exception {
+        CasClient client = new CasClient();
+        client.init();
+        try (InputStream in = ResourceUtil.getResourceAsStream("images/codelibs_cover.jpeg")) {
+            String data = client.encodeImage(in);
+            assertEquals(70804, data.length());
+            // FileUtil.writeBytes("test.png", Base64.getDecoder().decode(data));
+        }
+    }
+
+    /**
+     * Requests an embedding for the sample image from the configured server. A
+     * {@link CurlException} is only logged, so the test does not fail on that error.
+     */
+    public void test_getImageEmbedding() throws Exception {
+        CasClient client = new CasClient();
+        client.init();
+        try (InputStream in = ResourceUtil.getResourceAsStream("images/codelibs_cover.jpeg")) {
+            float[] embedding = client.getImageEmbedding(in);
+            assertEquals(512, embedding.length);
+        } catch (CurlException e) {
+            logger.warning(e.getMessage());
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/org/codelibs/fess/crawler/extractor/CasExtractorTest.java b/src/test/java/org/codelibs/fess/crawler/extractor/CasExtractorTest.java
new file mode 100644
index 0000000..fdd2f25
--- /dev/null
+++
b/src/test/java/org/codelibs/fess/crawler/extractor/CasExtractorTest.java @@ -0,0 +1,71 @@ +/* + * Copyright 2012-2024 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.crawler.extractor; + +import java.io.InputStream; +import java.util.logging.Logger; + +import org.codelibs.core.io.CloseableUtil; +import org.codelibs.core.io.ResourceUtil; +import org.codelibs.fess.MultiModalConstants; +import org.codelibs.fess.client.CasClient; +import org.codelibs.fess.crawler.container.StandardCrawlerContainer; +import org.codelibs.fess.crawler.entity.ExtractData; +import org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl; +import org.codelibs.fess.util.EmbeddingUtil; +import org.dbflute.utflute.core.PlainTestCase; + +/** + * @author shinsuke + * + */ +public class CasExtractorTest extends PlainTestCase { + static final Logger logger = Logger.getLogger(CasExtractorTest.class.getName()); + + public CasExtractor casExtractor; + + @Override + protected void setUp() throws Exception { + super.setUp(); + + StandardCrawlerContainer container = new StandardCrawlerContainer(); + container// + .singleton("mimeTypeHelper", MimeTypeHelperImpl.class)// + .singleton("casExtractor", CasExtractor.class)// + .singleton("casClient", new CasClient() { + @Override + public float[] getImageEmbedding(final InputStream in) { + return new float[] { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f }; + } + })// + ; + + casExtractor = 
container.getComponent("casExtractor"); + casExtractor.init(); + } + + public void test_getTika() { + final InputStream in = ResourceUtil.getResourceAsStream("images/codelibs_cover.jpeg"); + final ExtractData extractData = casExtractor.getText(in, null); + final String content = extractData.getContent(); + CloseableUtil.closeQuietly(in); + assertEquals(0, content.length()); + String[] values = extractData.getValues(MultiModalConstants.X_FESS_EMBEDDING); + assertEquals(1, values.length); + float[] embedding = EmbeddingUtil.decodeFloatArray(values[0]); + assertEquals(5, embedding.length); + } +} diff --git a/src/test/java/org/codelibs/fess/util/EmbeddingUtilTest.java b/src/test/java/org/codelibs/fess/util/EmbeddingUtilTest.java new file mode 100644 index 0000000..7e45d39 --- /dev/null +++ b/src/test/java/org/codelibs/fess/util/EmbeddingUtilTest.java @@ -0,0 +1,35 @@ +/* + * Copyright 2012-2024 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. 
+ */ +package org.codelibs.fess.util; + +import org.codelibs.fess.util.EmbeddingUtil; +import org.dbflute.utflute.core.PlainTestCase; + +public class EmbeddingUtilTest extends PlainTestCase { + + public void test_encodeFloatArray() { + float[] array = new float[] { 1.0f, 2.0f, 3.0f }; + assertEquals("P4AAAEAAAABAQAAA", EmbeddingUtil.encodeFloatArray(array)); + } + + public void test_decodeFloatArray() { + float[] array = EmbeddingUtil.decodeFloatArray("P4AAAEAAAABAQAAA"); + assertEquals(3, array.length); + assertEquals(1.0f, array[0]); + assertEquals(2.0f, array[1]); + assertEquals(3.0f, array[2]); + } +} diff --git a/src/test/resources/images/codelibs_cover.jpeg b/src/test/resources/images/codelibs_cover.jpeg new file mode 100644 index 0000000..9e3c66b Binary files /dev/null and b/src/test/resources/images/codelibs_cover.jpeg differ