Split imports on preprocess #3389

Merged
merged 47 commits into from Aug 5, 2024
Changes from 5 commits
Commits
47 commits
faaa7f8
initial impl to split cqpps on write
awildturtok Apr 10, 2024
ff1499c
Update backend/src/main/java/com/bakdata/conquery/models/jobs/ImportJ…
awildturtok Apr 10, 2024
f38d321
adds bucket as preprocess param
awildturtok Apr 11, 2024
0f2a151
minor fix to CachedStore
awildturtok Apr 23, 2024
4917e0a
use new store in RestartTest (old environments were closed but worked…
awildturtok Apr 23, 2024
ec06b0e
fix bollean parameters
awildturtok May 2, 2024
09618e5
count buckets instead of exception-oriented flow
awildturtok Jun 3, 2024
729f41e
review feedback
awildturtok Jun 5, 2024
8b77f38
rearrange ImportJob methods into ClusterImportHandler.
awildturtok Jun 5, 2024
a1c22d4
cleanup
awildturtok Jun 12, 2024
bc747b4
Merge branch 'develop' into feature/split-cqpps
awildturtok Jun 12, 2024
bd3a5c7
map validation exceptions to http error
awildturtok Jun 12, 2024
9601c87
fix dropped param
awildturtok Jun 12, 2024
b5c49d4
Merge branch 'develop' into feature/split-cqpps
awildturtok Jun 13, 2024
83bd498
delay CalulateCBlocksJob to speedup import AND batching of CBlocksJob
awildturtok Jun 17, 2024
b897477
cleanup of toString and logging in ImportJob.java
awildturtok Jun 17, 2024
428646c
Merge remote-tracking branch 'origin/feature/split-cqpps' into featur…
awildturtok Jun 17, 2024
bff7c07
adds missing CPSType anno
awildturtok Jun 17, 2024
4acdf86
fix CPSBase of StartCalculateCblocks
awildturtok Jun 24, 2024
02ba25a
TODOs for entity2Bucket
awildturtok Jun 24, 2024
2afbc9d
inline sending of buckets only do postprocessing in slowqueue
awildturtok Jun 24, 2024
92c3561
cleanup
awildturtok Jun 24, 2024
bdb2048
fixes ENTITY_TO_BUCKET type signature
awildturtok Jun 24, 2024
88090ba
fix NonPersistentStoreFactory types
awildturtok Jun 24, 2024
0fddfa3
experiment: drop porgressReporter. We suspect directExecutor might in…
awildturtok Jun 25, 2024
5f47569
log pool information while waiting
awildturtok Jun 25, 2024
e3046e3
reduce logging
awildturtok Jun 25, 2024
32bd184
drop way too coarse mutex
awildturtok Jun 25, 2024
6937f16
fixes logging
awildturtok Jun 25, 2024
b33e282
restructure entities registration
awildturtok Jun 25, 2024
d7a4dd7
simplify CalculateCBlocksJob
awildturtok Jun 25, 2024
858bb9b
fix ProgrammaticTests
awildturtok Jun 25, 2024
753aa1e
adds new adminEndpointInfo
awildturtok Jun 25, 2024
735aec8
adds missing fix CPSBase of StartCalculateCblocks phase
awildturtok Jun 25, 2024
5de8f20
fixes a bug with entities not getting registered
awildturtok Jun 26, 2024
1e3a077
logging cleanup
awildturtok Jun 26, 2024
020e08e
reverts some schema changes to make migration to old storage possible
awildturtok Jul 3, 2024
be059f1
Merge remote-tracking branch 'origin/develop' into feature/split-cqpps
awildturtok Jul 4, 2024
ee1a236
change log-level of WorkerStorage messages for Bucket/CBlock management
awildturtok Jul 4, 2024
23f745b
makes ProgressReporterImpl.java multithreaded
awildturtok Jul 8, 2024
f38c0cc
use ConcurrentHashMap in ConceptTreeCache as it might cause race-cond…
awildturtok Jul 9, 2024
488130d
store Cache using Optional to circumvent null limitation of Concurren…
awildturtok Jul 9, 2024
7ebff2a
minor logging fixes for CalculateCBlocksJob.java
awildturtok Jul 9, 2024
5b959b3
revert deferral of calculating CBlocks. Instead we use an adapted imp…
awildturtok Jul 18, 2024
67033ed
Merge branch 'develop' into feature/split-cqpps
awildturtok Jul 18, 2024
f439350
fix missed calculateCBlocks reference
awildturtok Jul 18, 2024
7cce1c0
Merge remote-tracking branch 'origin/develop' into feature/split-cqpps
awildturtok Aug 1, 2024
@@ -44,6 +44,7 @@
import net.sourceforge.argparse4j.inf.ArgumentGroup;
import net.sourceforge.argparse4j.inf.Namespace;
import net.sourceforge.argparse4j.inf.Subparser;
import org.jetbrains.annotations.NotNull;

@Slf4j
@FieldNameConstants
@@ -52,7 +53,7 @@ public class PreprocessorCommand extends ConqueryCommand {
private final List<String> failed = Collections.synchronizedList(new ArrayList<>());
private final List<String> success = Collections.synchronizedList(new ArrayList<>());
private ExecutorService pool;
private boolean isFailFast = false;
private boolean isFailFast;
private boolean isStrict = true;

public PreprocessorCommand() {
@@ -71,14 +72,14 @@ public static boolean requiresProcessing(PreprocessingJob preprocessingJob) {

log.info("EXISTS ALREADY");

int currentHash = preprocessingJob.getDescriptor()
.calculateValidityHash(preprocessingJob.getCsvDirectory(), preprocessingJob.getTag());
final int currentHash = preprocessingJob.getDescriptor()
.calculateValidityHash(preprocessingJob.getCsvDirectory(), preprocessingJob.getTag());


final ObjectMapper om = Jackson.BINARY_MAPPER.copy();
try (final PreprocessedReader parser = new PreprocessedReader(new GZIPInputStream(new FileInputStream(preprocessingJob.getPreprocessedFile())), om)) {

PreprocessedHeader header = parser.readHeader();
final PreprocessedHeader header = parser.readHeader();

if (header.getValidityHash() == currentHash) {
log.info("\tHASH STILL VALID");
@@ -140,6 +141,11 @@ public void configure(Subparser subparser) {
.setDefault(true)
.help("Escalate missing files to errors.");

group.addArgument("--buckets")
.type(Integer.class)
.setDefault(100)
.help("Number of buckets to use for id-hashing. This value is required to be a constant per-dataset.");

}

@Override
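
The new --buckets parameter moves entity-to-bucket assignment into preprocessing: an entity id is hashed into one of a fixed number of buckets, so the same id always lands in the same bucket as long as the bucket count never changes for the dataset. The concrete hash Conquery uses is not part of this diff; the following is only a minimal sketch of the idea, with an illustrative hash function and class name chosen here.

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public final class BucketHashingSketch {

	// Illustrative only: maps an entity id to a stable bucket in [0, buckets).
	// Math.floorMod keeps the result non-negative even for negative hash codes.
	static int bucketOf(String entityId, int buckets) {
		final int hash = Arrays.hashCode(entityId.getBytes(StandardCharsets.UTF_8));
		return Math.floorMod(hash, buckets);
	}

	public static void main(String[] args) {
		// With the default of 100 buckets, "person-42" always lands in the same bucket.
		// Re-preprocessing the same dataset with a different bucket count would scatter
		// entities differently, which is why the value must stay constant per dataset.
		System.out.println(bucketOf("person-42", 100));
	}
}
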
@@ -150,41 +156,48 @@ protected void run(Environment environment, Namespace namespace, ConqueryConfig

// Tag if present is appended to input-file csvs, output-file cqpp and used as id of cqpps

isFailFast = Optional.ofNullable(namespace.getBoolean("fast-fail")).orElse(false);
isStrict = Optional.ofNullable(namespace.getBoolean("strict")).orElse(true);
isFailFast = namespace.getBoolean("fast-fail");
isStrict = namespace.getBoolean("strict");

final List<String> tags = namespace.<String>getList("tag");
final List<String> tags = namespace.getList("tag");

final File inDir = namespace.get("in");
final File outDir = namespace.get("out");
final List<File> descriptionFiles = namespace.<File>getList("desc");
final List<File> descriptionFilesRoot = namespace.getList("desc");
final int buckets = namespace.getInt("buckets");


log.info("Preprocessing from command line config.");

final Collection<PreprocessingJob> jobs = new ArrayList<>();
final Collection<PreprocessingJob> jobs = collectJobs(descriptionFilesRoot, tags, inDir, outDir, environment);

if (tags == null || tags.isEmpty()) {
for (File desc : descriptionFiles) {
final List<PreprocessingJob> descriptions =
findPreprocessingDescriptions(desc, inDir, outDir, Optional.empty(), environment.getValidator());
jobs.addAll(descriptions);
}
final List<PreprocessingJob> broken = validateJobs(jobs, environment);

jobs.removeIf(Predicate.not(PreprocessorCommand::requiresProcessing));

preprocessJobs(jobs, buckets, config);


log.info("Successfully Preprocess {} Jobs:", success.size());

Collaborator suggested change:
log.info("Successfully Preprocess {} Jobs:", success.size());
log.info("Successfully preprocessed {} jobs:", success.size());

success.forEach(desc -> log.info("\tSucceeded Preprocessing for {}", desc));

if (!broken.isEmpty()) {
log.warn("Did not find {} Files", broken.size());
broken.forEach(desc -> log.warn("\tDid not find file for {}", desc));
}
else {
for (String tag : tags) {
for (File desc : descriptionFiles) {
final List<PreprocessingJob> jobDescriptions =
findPreprocessingDescriptions(desc, inDir, outDir, Optional.of(tag), environment.getValidator());

jobs.addAll(jobDescriptions);
}
}
if (isFailed()) {
log.error("Failed {} Preprocessing Jobs:", failed.size());
failed.forEach(desc -> log.error("\tFailed Preprocessing for {}", desc));
doFail();
}
}

List<PreprocessingJob> broken = new ArrayList<>();
@NotNull
private List<PreprocessingJob> validateJobs(Collection<PreprocessingJob> jobs, Environment environment) {
final List<PreprocessingJob> broken = new ArrayList<>();

for (Iterator<PreprocessingJob> iterator = jobs.iterator(); iterator.hasNext(); ) {
for (final Iterator<PreprocessingJob> iterator = jobs.iterator(); iterator.hasNext(); ) {
final PreprocessingJob job = iterator.next();

try {
@@ -213,22 +226,48 @@ protected void run(Environment environment, Namespace namespace, ConqueryConfig
log.error("FAILED Preprocessing, files are missing or invalid.");
doFail();
}
return broken;
}

jobs.removeIf(Predicate.not(PreprocessorCommand::requiresProcessing));
@NotNull
private Collection<PreprocessingJob> collectJobs(List<File> descriptionFiles, List<String> tags, File inDir, File outDir, Environment environment)
throws IOException {
final Collection<PreprocessingJob> jobs = new ArrayList<>();

if (tags == null || tags.isEmpty()) {
for (File desc : descriptionFiles) {
final List<PreprocessingJob> descriptions =
findPreprocessingDescriptions(desc, inDir, outDir, Optional.empty(), environment.getValidator());
jobs.addAll(descriptions);
}
}
else {
for (String tag : tags) {
for (File desc : descriptionFiles) {
final List<PreprocessingJob> jobDescriptions =
findPreprocessingDescriptions(desc, inDir, outDir, Optional.of(tag), environment.getValidator());

jobs.addAll(jobDescriptions);
}
}
}
return jobs;
}

private void preprocessJobs(Collection<PreprocessingJob> jobs, int buckets, ConqueryConfig config) throws InterruptedException {
final long totalSize = jobs.stream()
.mapToLong(PreprocessingJob::estimateTotalCsvSizeBytes)
.sum();

log.info("Required to preprocess {} in total", BinaryByteUnit.format(totalSize));

ProgressBar totalProgress = new ProgressBar(totalSize, System.out);
final ProgressBar totalProgress = new ProgressBar(totalSize, System.out);

for (PreprocessingJob job : jobs) {
pool.submit(() -> {
ConqueryMDC.setLocation(job.toString());
try {
Preprocessor.preprocess(job, totalProgress, config);
Preprocessor.preprocess(job, totalProgress, config, buckets);
success.add(job.toString());
}
catch (FileNotFoundException e) {
@@ -246,23 +285,6 @@ protected void run(Environment environment, Namespace namespace, ConqueryConfig
pool.awaitTermination(24, TimeUnit.HOURS);

ConqueryMDC.clearLocation();


if (!success.isEmpty()) {
log.info("Successfully Preprocess {} Jobs:", success.size());
success.forEach(desc -> log.info("\tSucceeded Preprocessing for {}", desc));
}

if (!broken.isEmpty()) {
log.warn("Did not find {} Files", broken.size());
broken.forEach(desc -> log.warn("\tDid not find file for {}", desc));
}

if (isFailed()) {
log.error("Failed {} Preprocessing Jobs:", failed.size());
failed.forEach(desc -> log.error("\tFailed Preprocessing for {}", desc));
doFail();
}
}

private void addMissing(PreprocessingJob job) {
@@ -281,7 +303,7 @@ private void addFailed(PreprocessingJob job) {

public List<PreprocessingJob> findPreprocessingDescriptions(File descriptionFiles, File inDir, File outputDir, Optional<String> tag, Validator validator)
throws IOException {
List<PreprocessingJob> out = new ArrayList<>();
final List<PreprocessingJob> out = new ArrayList<>();

final File[] files = descriptionFiles.isFile()
? new File[]{descriptionFiles}
@@ -302,8 +324,7 @@ private boolean isFailed() {
return !failed.isEmpty();
}

private Optional<PreprocessingJob> tryExtractDescriptor(Validator validator, Optional<String> tag, File descriptionFile, File outputDir, File csvDir)
throws IOException {
private Optional<PreprocessingJob> tryExtractDescriptor(Validator validator, Optional<String> tag, File descriptionFile, File outputDir, File csvDir) {
try {
final TableImportDescriptor
descriptor =
@@ -4,8 +4,6 @@
import java.util.Objects;
import java.util.OptionalInt;

import jakarta.validation.Validator;

import com.bakdata.conquery.io.storage.xodus.stores.CachedStore;
import com.bakdata.conquery.io.storage.xodus.stores.SingletonStore;
import com.bakdata.conquery.models.config.StoreFactory;
@@ -19,6 +17,7 @@
import com.bakdata.conquery.models.worker.WorkerToBucketsMap;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;
import jakarta.validation.Validator;
import lombok.extern.slf4j.Slf4j;

@Slf4j
@@ -121,12 +120,8 @@ public OptionalInt getEntityBucket(String entity) {
return OptionalInt.of(bucket);
}

public int assignEntityBucket(String entity, int bucketSize) {
final int bucket = (int) Math.ceil((1d + getNumberOfEntities()) / (double) bucketSize);

entity2Bucket.add(entity, bucket);

return bucket;
public void assignEntityBucket(String entity, int bucket) {

Collaborator suggested change:
public void assignEntityBucket(String entity, int bucket) {
public void assignEntityToBucket(String entity, int bucket) {

entity2Bucket.update(entity, bucket);
}
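
For context, the removed method body above assigned buckets by fill level instead of taking a precomputed bucket: each bucket was filled with bucketSize entities before the next one was opened, whereas the bucket passed in now is presumably computed earlier, during preprocessing. A small standalone rendering of the removed formula, with class and method names chosen here for illustration:

public final class OldBucketAssignmentSketch {

	// Mirrors the removed formula: ceil((1 + numberOfEntities) / bucketSize).
	static int assign(int numberOfEntities, int bucketSize) {
		return (int) Math.ceil((1d + numberOfEntities) / (double) bucketSize);
	}

	public static void main(String[] args) {
		System.out.println(assign(0, 100));   // 1 -> the first entity opens bucket 1
		System.out.println(assign(99, 100));  // 1 -> bucket 1 holds up to 100 entities
		System.out.println(assign(100, 100)); // 2 -> the 101st entity opens bucket 2
	}
}
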


@@ -33,8 +33,7 @@ public void add(KEY key, VALUE value) {

@Override
public VALUE get(KEY key) {
// TODO: 08.01.2020 fk: This assumes that all values have been read at some point!
return cache.get(key);
return cache.computeIfAbsent(key, store::get);
}

@Override
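
The get change above turns the cache into a read-through cache: instead of assuming every value was loaded up front, a miss now loads the value from the backing store via computeIfAbsent. Below is a rough, self-contained sketch of that pattern, not the real CachedStore types; the Optional wrapper reflects the later commit that works around ConcurrentHashMap's inability to store null values.

import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;

final class ReadThroughCacheSketch<K, V> {

	private final Map<K, Optional<V>> cache = new ConcurrentHashMap<>();
	private final Function<K, V> backingStore;

	ReadThroughCacheSketch(Function<K, V> backingStore) {
		this.backingStore = backingStore;
	}

	// On a miss, load from the backing store exactly once and cache the result.
	V get(K key) {
		return cache.computeIfAbsent(key, k -> Optional.ofNullable(backingStore.apply(k)))
					.orElse(null);
	}
}
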
@@ -31,17 +31,15 @@ class ClusterImportHandler implements ImportHandler {
@SneakyThrows
@Override
public void updateImport(Namespace namespace, InputStream inputStream) {
ImportJob job = ImportJob.createOrUpdate(
final Table table = ImportJob.createOrUpdate(

Collaborator suggested change:
final Table table = ImportJob.createOrUpdate(
final Table table = ImportJob.createAndQueue(

Collaborator: I liked it better before. It was more split-phase: first creating the job, then submitting it.

As it is now, it looks odd at first that the function returns a Table.

Collaborator (author): Yes, it was better before, but it held everything in RAM at once, which is exactly what I want to avoid with this PR. The name really is no longer fitting, though.

datasetRegistry.get(namespace.getDataset().getId()),
inputStream,
config.getCluster().getEntityBucketSize(),
config,
true
);

namespace.getJobManager().addSlowJob(job);

clearDependentConcepts(namespace.getStorage().getAllConcepts(), job.getTable());
clearDependentConcepts(namespace.getStorage().getAllConcepts(), table);
}

private void clearDependentConcepts(Collection<Concept<?>> allConcepts, Table table) {
@@ -59,22 +57,21 @@ private void clearDependentConcepts(Collection<Concept<?>> allConcepts, Table ta
@SneakyThrows
@Override
public void addImport(Namespace namespace, InputStream inputStream) {
ImportJob job = ImportJob.createOrUpdate(
final Table table = ImportJob.createOrUpdate(
datasetRegistry.get(namespace.getDataset().getId()),
inputStream,
config.getCluster().getEntityBucketSize(),
config,
false
);
namespace.getJobManager().addSlowJob(job);

clearDependentConcepts(namespace.getStorage().getAllConcepts(), job.getTable());
clearDependentConcepts(namespace.getStorage().getAllConcepts(), table);
}

@Override
public void deleteImport(Import imp) {

DatasetId id = imp.getTable().getDataset().getId();
final DatasetId id = imp.getTable().getDataset().getId();
final DistributedNamespace namespace = datasetRegistry.get(id);

clearDependentConcepts(namespace.getStorage().getAllConcepts(), imp.getTable());
@@ -50,16 +50,15 @@
@FieldNameConstants
@Getter
@Setter
@ToString(of = {"numberOfEvents", "stores"}, callSuper = true)
@ToString(onlyExplicitlyIncluded = true, callSuper = true)
@AllArgsConstructor
@RequiredArgsConstructor(onConstructor_ = {@JsonCreator}, access = AccessLevel.PROTECTED)
public class Bucket extends IdentifiableImpl<BucketId> implements NamespacedIdentifiable<BucketId> {

@Min(0)
private final int bucket;

@Min(0)
private final int numberOfEvents;
@ToString.Include
@JsonManagedReference
@Setter(AccessLevel.PROTECTED)
private ColumnStore[] stores;
@@ -78,6 +77,12 @@ public class Bucket extends IdentifiableImpl<BucketId> implements NamespacedIden
private final Import imp;


@JsonIgnore
@ToString.Include
public int getNumberOfEvents(){
return ends.values().intStream().max().orElse(0);
}

@JsonIgnore
@ValidationMethod(message = "Number of events does not match the length of some stores.")
public boolean isNumberOfEventsEqualsNumberOfStores() {
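
numberOfEvents is no longer a stored field; it is derived as the largest per-entity end offset in the bucket. A tiny illustration, assuming the fastutil version in use provides IntCollection.intStream() (the diff above already relies on it); the class name is chosen here for the example.

import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;

public final class EndsSketch {

	public static void main(String[] args) {
		// Per-entity end offsets: entity A's events end at 10, entity B's at 25,
		// so the bucket spans 25 events in total.
		final Object2IntMap<String> ends = new Object2IntOpenHashMap<>();
		ends.put("A", 10);
		ends.put("B", 25);

		System.out.println(ends.values().intStream().max().orElse(0)); // prints 25
	}
}
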
@@ -20,7 +20,7 @@ public class EmptyBucket extends Bucket {
private static final EmptyBucket Instance = new EmptyBucket();

public EmptyBucket() {
super(0, 0, Object2IntMaps.emptyMap(), Object2IntMaps.emptyMap(), null);
super(0, Object2IntMaps.emptyMap(), Object2IntMaps.emptyMap(), null);
this.setStores(new ColumnStore[0]);
}


This file was deleted.
