-
Notifications
You must be signed in to change notification settings - Fork 47
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'text-splitter' of https://github.com/AyoTheDev/langtorch …
…into AyoTheDev-text-splitter
- Loading branch information
Showing
21 changed files
with
285 additions
and
64 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
6 changes: 3 additions & 3 deletions
6
src/main/java/ai/knowly/langtorch/capability/module/openai/SimpleChatCapability.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...rch/parser/ChatMessageToStringParser.java → ...ing/parser/ChatMessageToStringParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...va/ai/knowly/langtorch/parser/Parser.java → ...angtorch/preprocessing/parser/Parser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...ser/PromptTemplateToSingleTextParser.java → ...ser/PromptTemplateToSingleTextParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...orch/parser/SingleTextToStringParser.java → ...sing/parser/SingleTextToStringParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...arser/StringToMultiChatMessageParser.java → ...arser/StringToMultiChatMessageParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...orch/parser/StringToSingleTextParser.java → ...sing/parser/StringToSingleTextParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
9 changes: 9 additions & 0 deletions
9
src/main/java/ai/knowly/langtorch/preprocessing/splitter/text/SplitterOption.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
package ai.knowly.langtorch.preprocessing.splitter.text; | ||
|
||
/**
 * Base type for the option objects accepted by a {@link TextSplitter}.
 *
 * <p>It carries the text that a splitter is asked to break apart; concrete subclasses add their
 * own splitter-specific settings.
 */
public abstract class SplitterOption {
  /** The text to be split, as supplied to the constructor. */
  String text;

  /**
   * Records the text for this option.
   *
   * @param text the text to be split; stored as given, without validation
   */
  protected SplitterOption(String text) {
    this.text = text;
  }
}
7 changes: 7 additions & 0 deletions
7
src/main/java/ai/knowly/langtorch/preprocessing/splitter/text/TextSplitter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
package ai.knowly.langtorch.preprocessing.splitter.text; | ||
|
||
import java.util.List; | ||
|
||
public interface TextSplitter<S extends SplitterOption> { | ||
List<String> splitText(S option); | ||
} |
65 changes: 65 additions & 0 deletions
65
src/main/java/ai/knowly/langtorch/preprocessing/splitter/text/word/WordSplitter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
package ai.knowly.langtorch.preprocessing.splitter.text.word; | ||
|
||
import ai.knowly.langtorch.preprocessing.splitter.text.TextSplitter; | ||
import com.google.common.collect.ImmutableList; | ||
import com.google.common.collect.ImmutableList.Builder; | ||
import java.util.List; | ||
|
||
/** Splits text into chunks of words. */ | ||
public class WordSplitter implements TextSplitter<WordSplitterOption> { | ||
|
||
public static WordSplitter create() { | ||
return new WordSplitter(); | ||
} | ||
|
||
@Override | ||
public List<String> splitText(WordSplitterOption option) { | ||
int maxLengthPerChunk = option.getMaxLengthPerChunk(); | ||
String text = option.getText(); | ||
|
||
Builder<String> chunks = ImmutableList.builder(); | ||
|
||
// Validate the maxLengthPerChunk | ||
if (maxLengthPerChunk < 1) { | ||
throw new IllegalArgumentException("maxLengthPerChunk should be greater than 0"); | ||
} | ||
|
||
String[] words = text.split("\\s+"); | ||
int minLengthOfWord = words[0].length(); | ||
|
||
for (String word : words) { | ||
minLengthOfWord = Math.min(minLengthOfWord, word.length()); | ||
} | ||
|
||
if (maxLengthPerChunk < minLengthOfWord) { | ||
throw new IllegalArgumentException( | ||
"maxLengthPerChunk is smaller than the smallest word in the string"); | ||
} | ||
|
||
StringBuilder chunk = new StringBuilder(); | ||
int wordsLength = words.length; | ||
|
||
for (int i = 0; i < wordsLength; i++) { | ||
String word = words[i]; | ||
boolean isLastWord = i == wordsLength - 1; | ||
if ((chunk.length() + word.length() + (isLastWord ? 0 : 1)) | ||
<= maxLengthPerChunk) { // '+1' accounts for spaces, except for the last word | ||
chunk.append(word); | ||
if (!isLastWord) { | ||
chunk.append(" "); | ||
} | ||
} else { | ||
chunks.add(chunk.toString().trim()); | ||
chunk = new StringBuilder(); | ||
chunk.append(word).append(" "); | ||
} | ||
} | ||
|
||
// Add remaining chunk if any | ||
if (chunk.length() > 0) { | ||
chunks.add(chunk.toString().trim()); | ||
} | ||
|
||
return chunks.build(); | ||
} | ||
} |
28 changes: 28 additions & 0 deletions
28
src/main/java/ai/knowly/langtorch/preprocessing/splitter/text/word/WordSplitterOption.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
package ai.knowly.langtorch.preprocessing.splitter.text.word; | ||
|
||
import ai.knowly.langtorch.preprocessing.splitter.text.SplitterOption; | ||
import lombok.Builder; | ||
import lombok.Data; | ||
import lombok.EqualsAndHashCode; | ||
|
||
/** Options for {@link WordSplitter}. */ | ||
@EqualsAndHashCode(callSuper = true) | ||
@Data | ||
@Builder(toBuilder = true, setterPrefix = "set") | ||
public class WordSplitterOption extends SplitterOption { | ||
// Unprocessed text. | ||
private final String text; | ||
|
||
// The max length of a chunk. | ||
private final int maxLengthPerChunk; | ||
|
||
private WordSplitterOption(String text, int maxLengthPerChunk) { | ||
super(text); | ||
this.text = text; | ||
this.maxLengthPerChunk = maxLengthPerChunk; | ||
} | ||
|
||
public static WordSplitterOption of(String text, int totalLengthOfChunk) { | ||
return new WordSplitterOption(text, totalLengthOfChunk); | ||
} | ||
} |
23 changes: 23 additions & 0 deletions
23
src/main/java/ai/knowly/langtorch/schema/io/DomainDocument.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
package ai.knowly.langtorch.schema.io; | ||
|
||
import java.util.Optional; | ||
|
||
public class DomainDocument implements Input, Output { | ||
|
||
private final String pageContent; | ||
|
||
private final Optional<Metadata> metadata; | ||
|
||
public DomainDocument(String pageContent, Optional<Metadata> metadata) { | ||
this.pageContent = pageContent; | ||
this.metadata = metadata; | ||
} | ||
|
||
public String getPageContent() { | ||
return pageContent; | ||
} | ||
|
||
public Optional<Metadata> getMetadata() { | ||
return metadata; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
package ai.knowly.langtorch.schema.io; | ||
|
||
import org.apache.commons.collections4.keyvalue.MultiKey; | ||
import org.apache.commons.collections4.map.MultiKeyMap; | ||
|
||
import java.util.Objects; | ||
|
||
public class Metadata { | ||
private final MultiKeyMap<String, String> value; | ||
|
||
public Metadata(MultiKeyMap<String, String> values) { | ||
this.value = values; | ||
} | ||
|
||
public MultiKeyMap<String, String> getValue() { | ||
return value; | ||
} | ||
|
||
public static Metadata create(){ | ||
return new Metadata(new MultiKeyMap<>()); | ||
} | ||
|
||
public Metadata set(MultiKey<String> key, String value) { | ||
this.value.put(key, value); | ||
return this; | ||
} | ||
|
||
public static Metadata copyOf(MultiKeyMap<String, String> values) { | ||
return new Metadata(values); | ||
} | ||
|
||
@Override | ||
public boolean equals(Object obj) { | ||
if (this == obj) { | ||
return true; | ||
} | ||
if (obj == null || getClass() != obj.getClass()) { | ||
return false; | ||
} | ||
Metadata other = (Metadata) obj; | ||
return Objects.equals(value, other.value); | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return Objects.hash(value); | ||
} | ||
} |
2 changes: 1 addition & 1 deletion
2
...PromptTemplateToSingleTextParserTest.java → ...PromptTemplateToSingleTextParserTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
85 changes: 85 additions & 0 deletions
85
src/test/java/ai/knowly/langtorch/preprocessing/splitter/text/word/WordSplitterTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
package ai.knowly.langtorch.preprocessing.splitter.text.word; | ||
|
||
import static com.google.common.truth.Truth.assertThat; | ||
import static org.junit.jupiter.api.Assertions.assertThrows; | ||
|
||
import java.util.List; | ||
import org.junit.jupiter.api.Test; | ||
|
||
class WordSplitterTest { | ||
@Test | ||
void testSplitText_NormalUsage() { | ||
// Arrange. | ||
WordSplitterOption option = | ||
WordSplitterOption.builder() | ||
.setText("Hello world, this is a test.") | ||
.setMaxLengthPerChunk(10) | ||
.build(); | ||
|
||
// Act. | ||
List<String> result = WordSplitter.create().splitText(option); | ||
|
||
// Assert. | ||
assertThat(result).containsExactly("Hello", "world,", "this is a", "test.").inOrder(); | ||
} | ||
|
||
@Test | ||
void testSplitText_SingleWord() { | ||
// Arrange. | ||
WordSplitterOption option = | ||
WordSplitterOption.builder().setText("Hello").setMaxLengthPerChunk(10).build(); | ||
|
||
// Act. | ||
List<String> result = WordSplitter.create().splitText(option); | ||
|
||
// Assert. | ||
assertThat(result).containsExactly("Hello"); | ||
} | ||
|
||
@Test | ||
void testSplitText_SingleChar() { | ||
// Arrange. | ||
WordSplitterOption option = | ||
WordSplitterOption.builder().setText("H").setMaxLengthPerChunk(1).build(); | ||
|
||
// Act. | ||
List<String> result = WordSplitter.create().splitText(option); | ||
|
||
// Assert. | ||
assertThat(result).containsExactly("H"); | ||
} | ||
|
||
void testSplitText_MaxLengthSmallerThanWordLength() { | ||
// Arrange. | ||
WordSplitterOption option = | ||
WordSplitterOption.builder().setText("Hello").setMaxLengthPerChunk(3).build(); | ||
|
||
// Act. | ||
// Assert. | ||
assertThrows(IllegalArgumentException.class, () -> WordSplitter.create().splitText(option)); | ||
} | ||
|
||
@Test | ||
void testSplitText_NegativeMaxLength() { | ||
// Arrange. | ||
WordSplitterOption option = | ||
WordSplitterOption.builder().setText("Hello").setMaxLengthPerChunk(-5).build(); | ||
|
||
// Act. | ||
// Assert. | ||
assertThrows(IllegalArgumentException.class, () -> WordSplitter.create().splitText(option)); | ||
} | ||
|
||
@Test | ||
void testSplitText_EmptyString() { | ||
// Arrange. | ||
WordSplitterOption option = | ||
WordSplitterOption.builder().setText("").setMaxLengthPerChunk(10).build(); | ||
|
||
// Act. | ||
List<String> result = WordSplitter.create().splitText(option); | ||
|
||
// Assert. | ||
assertThat(result).isEmpty(); | ||
} | ||
} |
Oops, something went wrong.