Commit 4de0652

Further updates from code review - added clarity around encoding parameters

drewfarris committed Sep 27, 2023
1 parent 9c5120a commit 4de0652
Showing 6 changed files with 52 additions and 36 deletions.

SSDeepSimilarityQueryConfiguration.java

@@ -6,6 +6,7 @@

import com.google.common.collect.Multimap;

import datawave.query.util.ssdeep.BucketAccumuloKeyGenerator;
import datawave.query.util.ssdeep.ChunkSizeEncoding;
import datawave.query.util.ssdeep.IntegerEncoding;
import datawave.query.util.ssdeep.NGramTuple;
@@ -17,15 +18,12 @@

public class SSDeepSimilarityQueryConfiguration extends GenericQueryConfiguration {

int indexBuckets = 32;

int queryThreads = 100;

int maxRepeatedCharacters = 3;

int bucketEncodingBase = 32;

int bucketEncodingLength = 2;
int indexBuckets = BucketAccumuloKeyGenerator.DEFAULT_BUCKET_COUNT;
int bucketEncodingBase = BucketAccumuloKeyGenerator.DEFAULT_BUCKET_ENCODING_BASE;
int bucketEncodingLength = BucketAccumuloKeyGenerator.DEFAULT_BUCKET_ENCODING_LENGTH;

/** Used to encode buckets as characters which are prepended to the ranges used to retrieve ngram tuples */
private IntegerEncoding bucketEncoder;
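
The point of sharing these constants: the bucket encoder described in the comment above has to be built from the same base and length on the query side that the indexing side used, or the generated ranges will not line up with the stored keys. A minimal sketch of that wiring, assuming a config reference and accessor names (getBucketEncodingBase, getBucketEncodingLength) that are not shown in this diff:

    // Sketch (assumed accessors): build the query-side bucket encoder from the
    // same parameters the indexing side used, so ranges match the stored keys.
    IntegerEncoding bucketEncoder = new IntegerEncoding(config.getBucketEncodingBase(), config.getBucketEncodingLength());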

SSDeepSimilarityQueryTransformer.java

@@ -35,11 +35,6 @@ public class SSDeepSimilarityQueryTransformer extends BaseQueryLogicTransformer<

private static final Logger log = Logger.getLogger(SSDeepSimilarityQueryTransformer.class);

/** The number of characters in the bucket encoding alphabet */
public static final int BUCKET_ENCODING_BASE = 32;
/** The length of the bucket encoding we will perform */
public static final int BUCKET_ENCODING_LENGTH = 2;

protected final Authorizations auths;

protected final ResponseObjectFactory responseObjectFactory;

BucketAccumuloKeyGenerator.java

@@ -27,10 +27,12 @@ public class BucketAccumuloKeyGenerator {
public static final byte[] EMPTY_BYTES = new byte[0];
public static final Value EMPTY_VALUE = new Value();

public static final int DEFAULT_BUCKET_COUNT = 32;

/** The number of characters in the bucket encoding alphabet */
public static final int BUCKET_ENCODING_BASE = 32;
public static final int DEFAULT_BUCKET_ENCODING_BASE = 32;
/** The length of the bucket encoding we will perform */
public static final int BUCKET_ENCODING_LENGTH = 2;
public static final int DEFAULT_BUCKET_ENCODING_LENGTH = 2;

/** The maximum number of buckets we will partition data into */
final int bucketCount;
@@ -44,14 +46,18 @@ public class BucketAccumuloKeyGenerator {
final long timestamp = 0;

/**
* Creates a BucketAccumuloKeyGenerator with the specified bucket count
* Creates a BucketAccumuloKeyGenerator with the specified bucket count and encoding properties
*
* @param bucketCount
* the number of potential buckets into which we will partition data.
* the number of index buckets (partitions) that will be used.
* @param bucketEncodingBase
* the size of the alphabet that will be used to encode the index bucket number in the key.
* @param bucketEncodingLength
* the number of characters that will be used to encode the index bucket number.
*/
public BucketAccumuloKeyGenerator(int bucketCount) {
public BucketAccumuloKeyGenerator(int bucketCount, int bucketEncodingBase, int bucketEncodingLength) {
this.bucketCount = bucketCount;
this.bucketEncoding = new IntegerEncoding(BUCKET_ENCODING_BASE, BUCKET_ENCODING_LENGTH);
this.bucketEncoding = new IntegerEncoding(bucketEncodingBase, bucketEncodingLength);
this.chunkEncoding = new ChunkSizeEncoding();
this.ngramEncoding = new SSDeepEncoding();

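A usage sketch built only from what this diff shows — the widened constructor plus the new default constants; nothing else is assumed:

    // All three partitioning parameters are chosen together; the defaults
    // remain available for callers that want the stock configuration.
    BucketAccumuloKeyGenerator keyGenerator = new BucketAccumuloKeyGenerator(
            BucketAccumuloKeyGenerator.DEFAULT_BUCKET_COUNT,
            BucketAccumuloKeyGenerator.DEFAULT_BUCKET_ENCODING_BASE,
            BucketAccumuloKeyGenerator.DEFAULT_BUCKET_ENCODING_LENGTH);

Note the implicit constraint the three values carry: bucketCount can be at most bucketEncodingBase^bucketEncodingLength (32 <= 32^2 = 1024 with the defaults), since every bucket number must fit in the fixed-width encoding.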

ChunkSizeEncoding.java

@@ -4,7 +4,7 @@

// @formatter:off

/** The encoder exploits the fact that there is a small number of legal chunk sizes based on the minimum chunk size.
/** The encoder exploits the fact that there are a small number of legal chunk sizes based on the minimum chunk size.
* It introduces the concept of a chunkIndex, a number that is considerably smaller than the chunk size itself, and
* represents the magnitude of the chunkSize such that:
* <p>
@@ -27,22 +27,28 @@
//@formatter:on
public class ChunkSizeEncoding implements Serializable {

static final int MIN_CHUNK_SIZE = 3;
static final int SPAM_SUM_LENGTH = 64;
private static final int MIN_CHUNK_SIZE = 3;
private static final int DEFAULT_ENCODING_ALPHABET_LENGTH = HashReverse.LEXICAL_B64_TABLE.length;

private static final int DEFAULT_ENCODING_LENGTH = 1;

static final double L2 = Math.log(2);

private final IntegerEncoding chunkIndexEncoding;

final int minChunkSize;

/**
* Create a ChunkSizeEncoding with the default parameters of a 64 character encoding alphabet and a length of 1. This allows us to encode 64 distinct chunk
* index values. Chunk index 0 represents the MIN_CHUNK_SIZE. See class javadocs for more info.
*/
public ChunkSizeEncoding() {
this(MIN_CHUNK_SIZE, SPAM_SUM_LENGTH, 1);
this(MIN_CHUNK_SIZE, DEFAULT_ENCODING_ALPHABET_LENGTH, DEFAULT_ENCODING_LENGTH);
}

public ChunkSizeEncoding(int minChunkSize, int spamSumLength, int encodingLength) {
public ChunkSizeEncoding(int minChunkSize, int encodingAlphabetLength, int encodingLength) {
this.minChunkSize = minChunkSize;
this.chunkIndexEncoding = new IntegerEncoding(spamSumLength, encodingLength);
this.chunkIndexEncoding = new IntegerEncoding(encodingAlphabetLength, encodingLength);
}

public long getLimit() {
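
In concrete terms, the chunkIndex scheme relies on ssdeep chunk sizes forming a geometric series: each legal size is minChunkSize * 2^index, which is why the class keeps L2 = Math.log(2) around. A self-contained sketch of the two mappings, assuming that power-of-two relationship (the real class additionally runs the index through its IntegerEncoding):

    // chunkIndex -> chunkSize: legal sizes are minChunkSize * 2^chunkIndex
    static long chunkSize(int minChunkSize, int chunkIndex) {
        return (long) minChunkSize << chunkIndex;
    }

    // chunkSize -> chunkIndex: invert with a base-2 logarithm, i.e. log(x) / L2
    static int chunkIndex(int minChunkSize, long chunkSize) {
        return (int) Math.round(Math.log((double) chunkSize / minChunkSize) / Math.log(2));
    }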

IntegerEncoding.java

@@ -8,16 +8,25 @@
*/
public class IntegerEncoding implements Serializable {

// The number of distinct characters used for encoding
final int base;
// the target length of the encoding
final int length;
// the max integer value we can encode, derived from the base and length parameters.
final int limit;

/**
* We are using the LEXICAL_B64_TABLE to encode integers to characters; our max base (the unique characters we use for encoding) is based on the size of
* this alphabet.
*/
private static final int MAX_BASE = HashReverse.LEXICAL_B64_TABLE.length;

/**
* Create an unsigned integer encoder that uses the specified base (up to 64) and length (which can't generate numbers larger than Integer.MAX_VALUE). This
* uses the lexically sorted Base 64 alphabet for encoding.
*
* @param base
* base for encoding, must be larger than 2, less than 64.
* base for encoding, this is the number of distinct characters that will be used to encode integers; must be larger than 2, less than 64.
* @param length
* the length (in bytes) of the final encoding produced by this encoding
*/
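
What the base/length pair buys is a fixed-width encoding whose lexical order matches numeric order, so encoded bucket numbers sort correctly as Accumulo key prefixes. An illustrative stand-alone version of the scheme — the real class draws its alphabet from HashReverse.LEXICAL_B64_TABLE, and nothing below is the class's actual code:

    // Encode value as `length` digits in base `base`, most significant digit
    // first. If the alphabet's characters are in ascending order, string
    // comparisons on the output agree with numeric comparisons on the input.
    static String encodeFixedLength(int value, char[] alphabet, int base, int length) {
        long limit = (long) Math.pow(base, length);
        if (value < 0 || value >= limit) {
            throw new IllegalArgumentException("cannot encode " + value + " in " + length + " digits of base " + base);
        }
        char[] out = new char[length];
        for (int i = length - 1; i >= 0; i--) {
            out[i] = alphabet[value % base];
            value /= base;
        }
        return new String(out);
    }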

SSDeepQueryTest.java

@@ -2,14 +2,11 @@

import static org.junit.Assert.fail;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.stream.Stream;

@@ -54,7 +51,6 @@
import datawave.webservice.query.result.event.FieldBase;
import datawave.webservice.query.runner.RunningQuery;
import datawave.webservice.result.EventQueryResponseBase;
import it.unimi.dsi.fastutil.Hash;

public class SSDeepQueryTest {

@@ -74,19 +70,22 @@ public class SSDeepQueryTest {

protected DatawavePrincipal principal;

public static final int BUCKET_COUNT = BucketAccumuloKeyGenerator.DEFAULT_BUCKET_COUNT;
public static final int BUCKET_ENCODING_BASE = BucketAccumuloKeyGenerator.DEFAULT_BUCKET_ENCODING_BASE;
public static final int BUCKET_ENCODING_LENGTH = BucketAccumuloKeyGenerator.DEFAULT_BUCKET_ENCODING_LENGTH;

public static void indexSSDeepTestData(AccumuloClient accumuloClient) throws Exception {
// configuration
String ssdeepTableName = "ssdeepIndex";
int ngramSize = 7;
int indexBuckets = 32;
int minHashSize = 3;

// input
Stream<String> ssdeepLines = Stream.of(TEST_SSDEEPS);

// processing
final NGramByteHashGenerator nGramGenerator = new NGramByteHashGenerator(ngramSize, indexBuckets, minHashSize);
final BucketAccumuloKeyGenerator accumuloKeyGenerator = new BucketAccumuloKeyGenerator(indexBuckets);
final NGramByteHashGenerator nGramGenerator = new NGramByteHashGenerator(ngramSize, BUCKET_COUNT, minHashSize);
final BucketAccumuloKeyGenerator accumuloKeyGenerator = new BucketAccumuloKeyGenerator(BUCKET_COUNT, BUCKET_ENCODING_BASE, BUCKET_ENCODING_LENGTH);

// output
BatchWriterConfig batchWriterConfig = new BatchWriterConfig();
@@ -149,14 +148,17 @@ private static void logSSDeepTestData(String tableName) throws TableNotFoundException

@Before
public void setUpQuery() {
this.logic = new SSDeepSimilarityQueryLogic();
this.logic.setTableName("ssdeepIndex");
this.logic.setMarkingFunctions(new MarkingFunctions.Default());
this.logic.setResponseObjectFactory(new DefaultResponseObjectFactory());
logic = new SSDeepSimilarityQueryLogic();
logic.setTableName("ssdeepIndex");
logic.setMarkingFunctions(new MarkingFunctions.Default());
logic.setResponseObjectFactory(new DefaultResponseObjectFactory());
logic.setBucketEncodingBase(BUCKET_ENCODING_BASE);
logic.setBucketEncodingLength(BUCKET_ENCODING_LENGTH);
logic.setIndexBuckets(BUCKET_COUNT);

SubjectIssuerDNPair dn = SubjectIssuerDNPair.of("userDn", "issuerDn");
DatawaveUser user = new DatawaveUser(dn, DatawaveUser.UserType.USER, Sets.newHashSet(auths.toString().split(",")), null, null, -1L);
this.principal = new DatawavePrincipal(Collections.singleton(user));
principal = new DatawavePrincipal(Collections.singleton(user));
}

@Test
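With every parameter routed through the shared defaults, the index side (BucketAccumuloKeyGenerator) and the query side (the SSDeepSimilarityQueryLogic setters above) are guaranteed to agree. If the values ever become configurable per deployment, the property worth checking is that the bucket count fits the encoding — a hedged sketch, since the production classes may well enforce this themselves:

    // bucketCount must fit in bucketEncodingLength digits of the given base:
    // bucketCount <= base^length, e.g. 32 <= 32^2 = 1024 for the defaults.
    static void checkBucketParameters(int bucketCount, int base, int length) {
        if (bucketCount > (long) Math.pow(base, length)) {
            throw new IllegalArgumentException("bucket count " + bucketCount + " does not fit in " + length + " digits of base " + base);
        }
    }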
