Commit 4de0652

Further updates from code review - added clarity around encoding parameters

drewfarris committed Sep 27, 2023
1 parent 9c5120a commit 4de0652
Showing 6 changed files with 52 additions and 36 deletions.

SSDeepSimilarityQueryConfiguration.java

@@ -6,6 +6,7 @@

import com.google.common.collect.Multimap;

import datawave.query.util.ssdeep.BucketAccumuloKeyGenerator;
import datawave.query.util.ssdeep.ChunkSizeEncoding;
import datawave.query.util.ssdeep.IntegerEncoding;
import datawave.query.util.ssdeep.NGramTuple;
@@ -17,15 +18,12 @@

public class SSDeepSimilarityQueryConfiguration extends GenericQueryConfiguration {

int indexBuckets = 32;

int queryThreads = 100;

int maxRepeatedCharacters = 3;

int bucketEncodingBase = 32;

int bucketEncodingLength = 2;
int indexBuckets = BucketAccumuloKeyGenerator.DEFAULT_BUCKET_COUNT;
int bucketEncodingBase = BucketAccumuloKeyGenerator.DEFAULT_BUCKET_ENCODING_BASE;
int bucketEncodingLength = BucketAccumuloKeyGenerator.DEFAULT_BUCKET_ENCODING_LENGTH;

/** Used to encode buckets as characters which are prepended to the ranges used to retrieve ngram tuples */
private IntegerEncoding bucketEncoder;
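
The point of sharing these constants: the bucket encoder described in the comment above has to be built from the same base and length on the query side that the indexing side used, or the generated ranges will not line up with the stored keys. A minimal sketch of that wiring, assuming a config reference and accessor names (getBucketEncodingBase, getBucketEncodingLength) that are not shown in this diff:

    // Sketch (assumed accessors): build the query-side bucket encoder from the
    // same parameters the indexing side used, so ranges match the stored keys.
    IntegerEncoding bucketEncoder = new IntegerEncoding(config.getBucketEncodingBase(), config.getBucketEncodingLength());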

SSDeepSimilarityQueryTransformer.java

@@ -35,11 +35,6 @@ public class SSDeepSimilarityQueryTransformer extends BaseQueryLogicTransformer<

private static final Logger log = Logger.getLogger(SSDeepSimilarityQueryTransformer.class);

/** The number of characters in the bucket encoding alphabet */
public static final int BUCKET_ENCODING_BASE = 32;
/** The length of the bucket encoding we will perform */
public static final int BUCKET_ENCODING_LENGTH = 2;

protected final Authorizations auths;

protected final ResponseObjectFactory responseObjectFactory;

BucketAccumuloKeyGenerator.java

@@ -27,10 +27,12 @@ public class BucketAccumuloKeyGenerator {
public static final byte[] EMPTY_BYTES = new byte[0];
public static final Value EMPTY_VALUE = new Value();

public static final int DEFAULT_BUCKET_COUNT = 32;

/** The number of characters in the bucket encoding alphabet */
public static final int BUCKET_ENCODING_BASE = 32;
public static final int DEFAULT_BUCKET_ENCODING_BASE = 32;
/** The length of the bucket encoding we will perform */
public static final int BUCKET_ENCODING_LENGTH = 2;
public static final int DEFAULT_BUCKET_ENCODING_LENGTH = 2;

/** The maximum number of buckets we will partition data into */
final int bucketCount;
@@ -44,14 +46,18 @@ public class BucketAccumuloKeyGenerator {
final long timestamp = 0;

/**
* Creates a BucketAccumuloKeyGenerator with the specified bucket count
* Creates a BucketAccumuloKeyGenerator with the specified bucket count and encoding properties
*
* @param bucketCount
* the number of potential buckets into which we will partition data.
* the number of index buckets (partitions) that will be used.
* @param bucketEncodingBase
* the size of the alphabet that will be used to encode the index bucket number in the key.
* @param bucketEncodingLength
* the number of characters that will be used to encode the index bucket number.
*/
public BucketAccumuloKeyGenerator(int bucketCount) {
public BucketAccumuloKeyGenerator(int bucketCount, int bucketEncodingBase, int bucketEncodingLength) {
this.bucketCount = bucketCount;
this.bucketEncoding = new IntegerEncoding(BUCKET_ENCODING_BASE, BUCKET_ENCODING_LENGTH);
this.bucketEncoding = new IntegerEncoding(bucketEncodingBase, bucketEncodingLength);
this.chunkEncoding = new ChunkSizeEncoding();
this.ngramEncoding = new SSDeepEncoding();

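A usage sketch built only from what this diff shows — the widened constructor plus the new default constants; nothing else is assumed:

    // All three partitioning parameters are chosen together; the defaults
    // remain available for callers that want the stock configuration.
    BucketAccumuloKeyGenerator keyGenerator = new BucketAccumuloKeyGenerator(
            BucketAccumuloKeyGenerator.DEFAULT_BUCKET_COUNT,
            BucketAccumuloKeyGenerator.DEFAULT_BUCKET_ENCODING_BASE,
            BucketAccumuloKeyGenerator.DEFAULT_BUCKET_ENCODING_LENGTH);

Note the implicit constraint the three values carry: bucketCount can be at most bucketEncodingBase^bucketEncodingLength (32 <= 32^2 = 1024 with the defaults), since every bucket number must fit in the fixed-width encoding.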

ChunkSizeEncoding.java

@@ -4,7 +4,7 @@

// @formatter:off

/** The encoder exploits the fact that there is a small number of legal chunk sizes based on the minimum chunk size.
/** The encoder exploits the fact that there are a small number of legal chunk sizes based on the minimum chunk size.
* It introduces the concept of a chunkIndex, a number that is considerably smaller than the chunk size itself, and
* represents the magnitude of the chunkSize such that:
* <p>
@@ -27,22 +27,28 @@
//@formatter:on
public class ChunkSizeEncoding implements Serializable {

static final int MIN_CHUNK_SIZE = 3;
static final int SPAM_SUM_LENGTH = 64;
private static final int MIN_CHUNK_SIZE = 3;
private static final int DEFAULT_ENCODING_ALPHABET_LENGTH = HashReverse.LEXICAL_B64_TABLE.length;

private static final int DEFAULT_ENCODING_LENGTH = 1;

static final double L2 = Math.log(2);

private final IntegerEncoding chunkIndexEncoding;

final int minChunkSize;

/**
* Create a ChunkSizeEncoding with the default parameters of a 64 character encoding alphabet and a length of 1. This allows us to encode 64 distinct chunk
* index values. Chunk index 0 represents the MIN_CHUNK_SIZE. See class javadocs for more info.
*/
public ChunkSizeEncoding() {
this(MIN_CHUNK_SIZE, SPAM_SUM_LENGTH, 1);
this(MIN_CHUNK_SIZE, DEFAULT_ENCODING_ALPHABET_LENGTH, DEFAULT_ENCODING_LENGTH);
}

public ChunkSizeEncoding(int minChunkSize, int spamSumLength, int encodingLength) {
public ChunkSizeEncoding(int minChunkSize, int encodingAlphabetLength, int encodingLength) {
this.minChunkSize = minChunkSize;
this.chunkIndexEncoding = new IntegerEncoding(spamSumLength, encodingLength);
this.chunkIndexEncoding = new IntegerEncoding(encodingAlphabetLength, encodingLength);
}

public long getLimit() {
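
In concrete terms, the chunkIndex scheme relies on ssdeep chunk sizes forming a geometric series: each legal size is minChunkSize * 2^index, which is why the class keeps L2 = Math.log(2) around. A self-contained sketch of the two mappings, assuming that power-of-two relationship (the real class additionally runs the index through its IntegerEncoding):

    // chunkIndex -> chunkSize: legal sizes are minChunkSize * 2^chunkIndex
    static long chunkSize(int minChunkSize, int chunkIndex) {
        return (long) minChunkSize << chunkIndex;
    }

    // chunkSize -> chunkIndex: invert with a base-2 logarithm, i.e. log(x) / L2
    static int chunkIndex(int minChunkSize, long chunkSize) {
        return (int) Math.round(Math.log((double) chunkSize / minChunkSize) / Math.log(2));
    }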

IntegerEncoding.java

@@ -8,16 +8,25 @@
*/
public class IntegerEncoding implements Serializable {

// The number of distinct characters used for encoding
final int base;
// the target length of the encoding
final int length;
// the max integer value we can encode, derived from the base and length parameters.
final int limit;

/**
* We are using the LEXICAL_B64_TABLE to encode integers to characters; our max base (the unique characters we use for encoding) is based on the size of
* this alphabet.
*/
private static final int MAX_BASE = HashReverse.LEXICAL_B64_TABLE.length;

/**
* Create an unsigned integer encoder that uses the specified base (up to 64) and length (which can't generate numbers larger than Integer.MAX_VALUE). This
* uses the lexically sorted Base 64 alphabet for encoding.
*
* @param base
* base for encoding, must be larger than 2, less than 64.
* base for encoding, this is the number of distinct characters that will be used to encode integers; must be larger than 2, less than 64.
* @param length
* the length (in bytes) of the final encoding produced by this encoding
*/
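
What the base/length pair buys is a fixed-width encoding whose lexical order matches numeric order, so encoded bucket numbers sort correctly as Accumulo key prefixes. An illustrative stand-alone version of the scheme — the real class draws its alphabet from HashReverse.LEXICAL_B64_TABLE, and nothing below is the class's actual code:

    // Encode value as `length` digits in base `base`, most significant digit
    // first. If the alphabet's characters are in ascending order, string
    // comparisons on the output agree with numeric comparisons on the input.
    static String encodeFixedLength(int value, char[] alphabet, int base, int length) {
        long limit = (long) Math.pow(base, length);
        if (value < 0 || value >= limit) {
            throw new IllegalArgumentException("cannot encode " + value + " in " + length + " digits of base " + base);
        }
        char[] out = new char[length];
        for (int i = length - 1; i >= 0; i--) {
            out[i] = alphabet[value % base];
            value /= base;
        }
        return new String(out);
    }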

SSDeepQueryTest.java

@@ -2,14 +2,11 @@

import static org.junit.Assert.fail;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.stream.Stream;

@@ -54,7 +51,6 @@
import datawave.webservice.query.result.event.FieldBase;
import datawave.webservice.query.runner.RunningQuery;
import datawave.webservice.result.EventQueryResponseBase;
import it.unimi.dsi.fastutil.Hash;

public class SSDeepQueryTest {

@@ -74,19 +70,22 @@ public class SSDeepQueryTest {

protected DatawavePrincipal principal;

public static final int BUCKET_COUNT = BucketAccumuloKeyGenerator.DEFAULT_BUCKET_COUNT;
public static final int BUCKET_ENCODING_BASE = BucketAccumuloKeyGenerator.DEFAULT_BUCKET_ENCODING_BASE;
public static final int BUCKET_ENCODING_LENGTH = BucketAccumuloKeyGenerator.DEFAULT_BUCKET_ENCODING_LENGTH;

public static void indexSSDeepTestData(AccumuloClient accumuloClient) throws Exception {
// configuration
String ssdeepTableName = "ssdeepIndex";
int ngramSize = 7;
int indexBuckets = 32;
int minHashSize = 3;

// input
Stream<String> ssdeepLines = Stream.of(TEST_SSDEEPS);

// processing
final NGramByteHashGenerator nGramGenerator = new NGramByteHashGenerator(ngramSize, indexBuckets, minHashSize);
final BucketAccumuloKeyGenerator accumuloKeyGenerator = new BucketAccumuloKeyGenerator(indexBuckets);
final NGramByteHashGenerator nGramGenerator = new NGramByteHashGenerator(ngramSize, BUCKET_COUNT, minHashSize);
final BucketAccumuloKeyGenerator accumuloKeyGenerator = new BucketAccumuloKeyGenerator(BUCKET_COUNT, BUCKET_ENCODING_BASE, BUCKET_ENCODING_LENGTH);

// output
BatchWriterConfig batchWriterConfig = new BatchWriterConfig();
@@ -149,14 +148,17 @@ private static void logSSDeepTestData(String tableName) throws TableNotFoundException

@Before
public void setUpQuery() {
this.logic = new SSDeepSimilarityQueryLogic();
this.logic.setTableName("ssdeepIndex");
this.logic.setMarkingFunctions(new MarkingFunctions.Default());
this.logic.setResponseObjectFactory(new DefaultResponseObjectFactory());
logic = new SSDeepSimilarityQueryLogic();
logic.setTableName("ssdeepIndex");
logic.setMarkingFunctions(new MarkingFunctions.Default());
logic.setResponseObjectFactory(new DefaultResponseObjectFactory());
logic.setBucketEncodingBase(BUCKET_ENCODING_BASE);
logic.setBucketEncodingLength(BUCKET_ENCODING_LENGTH);
logic.setIndexBuckets(BUCKET_COUNT);

SubjectIssuerDNPair dn = SubjectIssuerDNPair.of("userDn", "issuerDn");
DatawaveUser user = new DatawaveUser(dn, DatawaveUser.UserType.USER, Sets.newHashSet(auths.toString().split(",")), null, null, -1L);
this.principal = new DatawavePrincipal(Collections.singleton(user));
principal = new DatawavePrincipal(Collections.singleton(user));
}

@Test
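With every parameter routed through the shared defaults, the index side (BucketAccumuloKeyGenerator) and the query side (the SSDeepSimilarityQueryLogic setters above) are guaranteed to agree. If the values ever become configurable per deployment, the property worth checking is that the bucket count fits the encoding — a hedged sketch, since the production classes may well enforce this themselves:

    // bucketCount must fit in bucketEncodingLength digits of the given base:
    // bucketCount <= base^length, e.g. 32 <= 32^2 = 1024 for the defaults.
    static void checkBucketParameters(int bucketCount, int base, int length) {
        if (bucketCount > (long) Math.pow(base, length)) {
            throw new IllegalArgumentException("bucket count " + bucketCount + " does not fit in " + length + " digits of base " + base);
        }
    }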
