From b15d58e1383e5b75b23f2bbefdd8ed8e6a368314 Mon Sep 17 00:00:00 2001 From: Arvid Heise Date: Sat, 23 Feb 2019 13:31:16 +0100 Subject: [PATCH] General cleanup --- LICENSE | 2 +- build.gradle.kts | 21 +- common/build.gradle.kts | 2 +- .../candidate_selection/CompositeValue.java | 5 +- .../candidate_selection/SortingKey.java | 5 +- .../OnlineSortedNeighborhoodMethod.java | 21 +- .../classifier/OracleClassifier.java | 36 ++- .../classifier/RuleBasedClassifier.java | 86 ++++--- .../clustering/ConsistentClustering.java | 25 +- .../clustering/OracleClustering.java | 39 ++- .../clustering/RefineCluster.java | 104 ++++---- .../clustering/RefinedTransitiveClosure.java | 14 +- .../clustering/TransitiveClosure.java | 13 +- .../deduplication/fusion/AnnotatedValue.java | 5 +- .../fusion/CommonConflictResolutions.java | 42 ++-- .../fusion/ConflictResolution.java | 11 +- .../fusion/ConflictResolutionFusion.java | 18 +- .../fusion/ConflictResolutions.java | 199 +-------------- .../deduplication/fusion/FusionContext.java | 5 +- .../bakdata/deduplication/fusion/Merge.java | 228 ++++++++++++++++++ .../deduplication/fusion/ResolutionPath.java | 14 +- .../deduplication/fusion/ResolutionTag.java | 5 +- .../bakdata/deduplication/fusion/Source.java | 5 +- .../fusion/TerminalConflictResolution.java | 8 +- .../similarity/CommonSimilarityMeasures.java | 115 ++------- .../similarity/DistanceSimilarityMeasure.java | 45 ++++ .../deduplication/similarity/Levensthein.java | 51 ++++ .../deduplication/similarity/MongeElkan.java | 71 ++++++ .../java/com/bakdata/util/ObjectUtils.java | 5 +- .../fusion/ConflictResolutionsTest.java | 15 +- core/build.gradle.kts | 2 +- .../deduplication/ExceptionContext.java | 5 +- .../candidate_selection/Candidate.java | 7 +- .../offline/OfflineCandidateSelection.java | 6 +- .../online/OnlineCandidateSelection.java | 6 +- .../classifier/Classification.java | 5 +- .../classifier/ClassifiedCandidate.java | 7 +- .../deduplication/classifier/Classifier.java | 7 +- 
.../deduplication/clustering/Cluster.java | 14 +- .../clustering/ClusterSplitHandler.java | 5 +- .../deduplication/clustering/Clustering.java | 6 +- .../deduplication/HardFusionHandler.java | 6 +- .../online/OnlineDeduplication.java | 5 +- .../online/OnlinePairBasedDeduplication.java | 11 +- .../duplicate_detection/HardPairHandler.java | 6 +- .../online/OnlineDuplicateDetection.java | 6 +- .../OnlinePairBasedDuplicateDetection.java | 10 +- .../deduplication/fusion/FusedValue.java | 8 +- .../bakdata/deduplication/fusion/Fusion.java | 5 +- .../deduplication/fusion/FusionException.java | 7 +- .../similarity/CutoffSimiliarityMeasure.java | 50 ++++ .../similarity/SimilarityContext.java | 5 +- .../similarity/SimilarityException.java | 7 +- .../similarity/SimilarityMeasure.java | 30 +-- .../similarity/SimilarityPath.java | 5 +- .../similarity/SimilarityTransformation.java | 7 +- .../com/bakdata/util/FunctionalClass.java | 14 +- .../bakdata/util/FunctionalConstructor.java | 10 +- .../com/bakdata/util/FunctionalMethod.java | 10 +- .../com/bakdata/util/FunctionalClassTest.java | 26 +- examples/build.gradle.kts | 2 +- .../bakdata/deduplication/person/Gender.java | 5 +- .../bakdata/deduplication/person/Person.java | 14 +- .../person/PersonCandidateSelection.java | 7 +- .../person/PersonClassifier.java | 25 +- .../person/PersonClustering.java | 11 +- .../person/PersonDeduplication.java | 9 +- .../person/PersonDuplicateDetection.java | 17 +- .../deduplication/person/PersonFusion.java | 18 +- .../person/PersonDeduplicationTest.java | 14 +- settings.gradle | 2 +- 71 files changed, 917 insertions(+), 720 deletions(-) create mode 100644 common/src/main/java/com/bakdata/deduplication/fusion/Merge.java create mode 100644 common/src/main/java/com/bakdata/deduplication/similarity/DistanceSimilarityMeasure.java create mode 100644 common/src/main/java/com/bakdata/deduplication/similarity/Levensthein.java create mode 100644 
common/src/main/java/com/bakdata/deduplication/similarity/MongeElkan.java create mode 100644 core/src/main/java/com/bakdata/deduplication/similarity/CutoffSimiliarityMeasure.java diff --git a/LICENSE b/LICENSE index d15cdac..84efbb0 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2018 bakdata +Copyright (c) 2019 bakdata Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/build.gradle.kts b/build.gradle.kts index 28a39b8..2f86616 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -1,9 +1,8 @@ plugins { - `java-library` // release id("net.researchgate.release") version "2.6.0" - id("com.bakdata.sonar") version "1.0.1" - id("com.bakdata.sonatype") version "1.0.1" + id("com.bakdata.sonar") version "1.1.2" + id("com.bakdata.sonatype") version "1.1.2" id("org.hildan.github.changelog") version "0.8.0" } @@ -46,13 +45,13 @@ subprojects { } dependencies { - testImplementation("org.junit.jupiter:junit-jupiter-api:5.3.0") - testRuntimeOnly("org.junit.jupiter:junit-jupiter-engine:5.3.0") - testImplementation(group= "org.assertj", name= "assertj-core", version= "3.11.1") - - compileOnly("org.projectlombok:lombok:1.18.4") - annotationProcessor("org.projectlombok:lombok:1.18.4") - testCompileOnly("org.projectlombok:lombok:1.18.4") - testAnnotationProcessor("org.projectlombok:lombok:1.18.4") + "testImplementation"("org.junit.jupiter:junit-jupiter-api:5.3.0") + "testRuntimeOnly"("org.junit.jupiter:junit-jupiter-engine:5.3.0") + "testImplementation"(group = "org.assertj", name = "assertj-core", version = "3.11.1") + + "compileOnly"("org.projectlombok:lombok:1.18.6") + "annotationProcessor"("org.projectlombok:lombok:1.18.6") + "testCompileOnly"("org.projectlombok:lombok:1.18.6") + "testAnnotationProcessor"("org.projectlombok:lombok:1.18.6") } } diff --git a/common/build.gradle.kts b/common/build.gradle.kts index 29b8008..1a8fcd4 100644 
--- a/common/build.gradle.kts +++ b/common/build.gradle.kts @@ -6,4 +6,4 @@ dependencies { "api"(group = "commons-codec", name = "commons-codec", version = "1.11") implementation(group = "com.google.guava", name = "guava", version = "26.0-jre") -} \ No newline at end of file +} diff --git a/common/src/main/java/com/bakdata/deduplication/candidate_selection/CompositeValue.java b/common/src/main/java/com/bakdata/deduplication/candidate_selection/CompositeValue.java index ab0685a..8ff8c95 100644 --- a/common/src/main/java/com/bakdata/deduplication/candidate_selection/CompositeValue.java +++ b/common/src/main/java/com/bakdata/deduplication/candidate_selection/CompositeValue.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.candidate_selection; diff --git a/common/src/main/java/com/bakdata/deduplication/candidate_selection/SortingKey.java b/common/src/main/java/com/bakdata/deduplication/candidate_selection/SortingKey.java index 1b6ef1e..72f2b93 100644 --- a/common/src/main/java/com/bakdata/deduplication/candidate_selection/SortingKey.java +++ b/common/src/main/java/com/bakdata/deduplication/candidate_selection/SortingKey.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.candidate_selection; diff --git a/common/src/main/java/com/bakdata/deduplication/candidate_selection/online/OnlineSortedNeighborhoodMethod.java b/common/src/main/java/com/bakdata/deduplication/candidate_selection/online/OnlineSortedNeighborhoodMethod.java index ed31ef5..e469889 100644 --- a/common/src/main/java/com/bakdata/deduplication/candidate_selection/online/OnlineSortedNeighborhoodMethod.java +++ b/common/src/main/java/com/bakdata/deduplication/candidate_selection/online/OnlineSortedNeighborhoodMethod.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.candidate_selection.online; @@ -46,7 +45,8 @@ public class OnlineSortedNeighborhoodMethod implements OnlineCandidateSelecti int defaultWindowSize = 10; public List> getCandidates(final T newRecord) { - return this.passes.stream().flatMap(pass -> pass.getCandidates(newRecord).stream()).distinct().collect(Collectors.toList()); + return this.passes.stream().flatMap(pass -> pass.getCandidates(newRecord).stream()).distinct() + .collect(Collectors.toList()); } @Value @@ -58,14 +58,14 @@ public static class Pass { List> getCandidates(final T newRecord) { final Comparable newKey = this.sortingKey.getKeyExtractor().apply(newRecord); - if(newKey == null) { + if (newKey == null) { return List.of(); } final Stream largerRecords = this.index.tailMap(newKey).values().stream().flatMap(List::stream).limit( - this.windowSize / 2); + this.windowSize / 2); final Stream smallerRecords = - this.index.descendingMap().tailMap(newKey).values().stream().flatMap(List::stream).limit( - this.windowSize / 2); + this.index.descendingMap().tailMap(newKey).values().stream().flatMap(List::stream).limit( + this.windowSize / 2); final List> candidates = Stream.concat(smallerRecords, largerRecords) .map(oldRecord -> new Candidate<>(newRecord, oldRecord)) .collect(Collectors.toList()); @@ -77,7 +77,8 @@ List> getCandidates(final T newRecord) { @SuppressWarnings({"WeakerAccess", "unused"}) public static class OnlineSortedNeighborhoodMethodBuilder { - public OnlineSortedNeighborhoodMethodBuilder sortingKey(final SortingKey sortingKey, final int windowSize) { + public OnlineSortedNeighborhoodMethodBuilder sortingKey(final SortingKey sortingKey, + final int windowSize) { return this.pass(new Pass<>(sortingKey, windowSize)); } @@ -90,7 +91,7 @@ public OnlineSortedNeighborhoodMethodBuilder sortingKeys(final Collection sortingKeys(final Iterable> sortingKeys, - final int windowSize) { + final int windowSize) { for (final SortingKey sortingKey : sortingKeys) { 
this.sortingKey(sortingKey, windowSize); } diff --git a/common/src/main/java/com/bakdata/deduplication/classifier/OracleClassifier.java b/common/src/main/java/com/bakdata/deduplication/classifier/OracleClassifier.java index 41525e8..1c8146b 100644 --- a/common/src/main/java/com/bakdata/deduplication/classifier/OracleClassifier.java +++ b/common/src/main/java/com/bakdata/deduplication/classifier/OracleClassifier.java @@ -1,3 +1,27 @@ +/* + * MIT License + * + * Copyright (c) 2019 bakdata GmbH + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + package com.bakdata.deduplication.classifier; import com.bakdata.deduplication.candidate_selection.Candidate; @@ -11,9 +35,9 @@ @Value public class OracleClassifier implements Classifier { private static final Classification DUPLICATE = - Classification.builder().result(Classification.ClassificationResult.DUPLICATE).confidence(1).build(); + Classification.builder().result(Classification.ClassificationResult.DUPLICATE).confidence(1).build(); private static final Classification NON_DUPLICATE = - Classification.builder().result(Classification.ClassificationResult.NON_DUPLICATE).confidence(1).build(); + Classification.builder().result(Classification.ClassificationResult.NON_DUPLICATE).confidence(1).build(); @NonNull Set> goldDuplicates; @@ -22,13 +46,13 @@ public class OracleClassifier implements Classifier { private Set> calculateSymmetricDuplicates() { return this.getGoldDuplicates().stream() - .flatMap(duplicate -> - Stream.of(duplicate, new Candidate<>(duplicate.getOldRecord(), duplicate.getNewRecord()))) - .collect(Collectors.toSet()); + .flatMap(duplicate -> + Stream.of(duplicate, new Candidate<>(duplicate.getOldRecord(), duplicate.getNewRecord()))) + .collect(Collectors.toSet()); } @Override public Classification classify(final Candidate candidate) { return this.getSymmetricDuplicates().contains(candidate) ? 
DUPLICATE : NON_DUPLICATE; } -} \ No newline at end of file +} diff --git a/common/src/main/java/com/bakdata/deduplication/classifier/RuleBasedClassifier.java b/common/src/main/java/com/bakdata/deduplication/classifier/RuleBasedClassifier.java index d5c99cd..ac181db 100644 --- a/common/src/main/java/com/bakdata/deduplication/classifier/RuleBasedClassifier.java +++ b/common/src/main/java/com/bakdata/deduplication/classifier/RuleBasedClassifier.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.classifier; @@ -36,16 +35,21 @@ import lombok.Value; /** - * Successively applies a list of rules to the record and returns the respective {@link Classification} with the following cases: + * Successively applies a list of rules to the record and returns the respective {@link Classification} with the + * following cases: *
    - *
  • If any rule classifies the pair unambiguously as {@link Classification.ClassificationResult#DUPLICATE} or {@link Classification.ClassificationResult#NON_DUPLICATE}, the classification is immediately returned.
  • - *
  • If some rule classifies the pair as {@link Classification.ClassificationResult#POSSIBLE_DUPLICATE}, the remaining rules with be evaluated to see if an unambiguous classification will be reached, in which case that classification is returned. If the results are only ambiguous, the last {@code POSSIBLE_DUPLICATE} classification will be returned.
  • + *
  • If any rule classifies the pair unambiguously as {@link Classification.ClassificationResult#DUPLICATE} or {@link + * Classification.ClassificationResult#NON_DUPLICATE}, the classification is immediately returned.
  • + *
  • If some rule classifies the pair as {@link Classification.ClassificationResult#POSSIBLE_DUPLICATE}, the + * remaining + * rules will be evaluated to see if an unambiguous classification will be reached, in which case that classification is + * returned. If the results are only ambiguous, the last {@code POSSIBLE_DUPLICATE} classification will be + * returned.
  • *
  • If no rule can be applied, the result is {@link #UNKNOWN}.
  • *
*
- * The {@code Classification} will contain a description naming the triggered rule and converts the rule score into a confidence score. - * - * @param + * The {@code Classification} will contain a description naming the triggered rule and converts the rule score into a + * confidence score. */ @Value @Builder @@ -84,50 +88,56 @@ private SimilarityException createException(final Candidate candidate, final } private Optional evaluateRule(final Rule rule, final Candidate candidate, - final SimilarityContext context) { - return context.safeExecute(() -> rule.evaluate(candidate.getNewRecord(), candidate.getOldRecord(), context)).map(score -> { - if (Float.isNaN(score)) { - return UNKNOWN; - } - if (score <= -0.0f) { - return Classification.builder() - .result(Classification.ClassificationResult.NON_DUPLICATE) - .confidence(-score) - .explanation(rule.getName()) - .build(); - } else { - return Classification.builder() - .result(Classification.ClassificationResult.DUPLICATE) - .confidence(score) - .explanation(rule.getName()) - .build(); - } - }); + final SimilarityContext context) { + return context.safeExecute(() -> rule.evaluate(candidate.getNewRecord(), candidate.getOldRecord(), context)) + .map(score -> { + if (Float.isNaN(score)) { + return UNKNOWN; + } + if (score <= -0.0f) { + return Classification.builder() + .result(Classification.ClassificationResult.NON_DUPLICATE) + .confidence(-score) + .explanation(rule.getName()) + .build(); + } else { + return Classification.builder() + .result(Classification.ClassificationResult.DUPLICATE) + .confidence(score) + .explanation(rule.getName()) + .build(); + } + }); } @SuppressWarnings({"WeakerAccess", "UnusedReturnValue"}) public static class RuleBasedClassifierBuilder { - public RuleBasedClassifierBuilder positiveRule(final String name, final BiPredicate applicablePredicate, - final SimilarityMeasure similarityMeasure) { + public RuleBasedClassifierBuilder positiveRule(final String name, + final BiPredicate 
applicablePredicate, + final SimilarityMeasure similarityMeasure) { return this.positiveRule(name, (left, right, context) -> - applicablePredicate.test(left, right) ? similarityMeasure.getSimilarity(left, right, context) : DOES_NOT_APPLY); + applicablePredicate.test(left, right) ? similarityMeasure.getSimilarity(left, right, context) + : DOES_NOT_APPLY); } - public RuleBasedClassifierBuilder positiveRule(final String name, final SimilarityMeasure similarityMeasure) { + public RuleBasedClassifierBuilder positiveRule(final String name, + final SimilarityMeasure similarityMeasure) { return this.rule(new Rule<>(name, similarityMeasure.unknownIf(s -> s <= 0))); } - public RuleBasedClassifierBuilder negativeRule(final String name, final BiPredicate applicablePredicate, - final SimilarityMeasure similarityMeasure) { + public RuleBasedClassifierBuilder negativeRule(final String name, + final BiPredicate applicablePredicate, + final SimilarityMeasure similarityMeasure) { return this.negativeRule(name, (left, right, context) -> - applicablePredicate.test(left, right) ? similarityMeasure.getSimilarity(left, right, context) : DOES_NOT_APPLY); + applicablePredicate.test(left, right) ? 
similarityMeasure.getSimilarity(left, right, context) + : DOES_NOT_APPLY); } public RuleBasedClassifierBuilder negativeRule(final String name, - final SimilarityMeasure similarityMeasure) { + final SimilarityMeasure similarityMeasure) { final SimilarityMeasure negativeSim = - (left, right, context) -> -similarityMeasure.getSimilarity(left, right, context); + (left, right, context) -> -similarityMeasure.getSimilarity(left, right, context); return this.rule(new Rule<>(name, negativeSim.unknownIf(s -> s >= 0))); } @@ -150,4 +160,4 @@ float evaluate(final T left, final T right, final SimilarityContext context) { return this.measure.getSimilarity(left, right, context); } } -} \ No newline at end of file +} diff --git a/common/src/main/java/com/bakdata/deduplication/clustering/ConsistentClustering.java b/common/src/main/java/com/bakdata/deduplication/clustering/ConsistentClustering.java index af6a705..df856e3 100644 --- a/common/src/main/java/com/bakdata/deduplication/clustering/ConsistentClustering.java +++ b/common/src/main/java/com/bakdata/deduplication/clustering/ConsistentClustering.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.clustering; @@ -38,21 +37,23 @@ import lombok.Value; /** - * Wraps another clustering and keeps clusters together, when the wrapped clustering would split it.
- * Example: consider a stable marriage-based clustering where A1-B have been previously matched and subsequently clustered. - * If a strong A2-B would replace that pair and thus split the cluster, this consistent clustering returns a cluster [A1, A2, B] instead.
+ * Wraps another clustering and keeps clusters together when the wrapped clustering would split them.
Example: + * consider a stable marriage-based clustering where A1-B have been previously matched and subsequently clustered. If a + * strong A2-B would replace that pair and thus split the cluster, this consistent clustering returns a cluster [A1, A2, + * B] instead.
*

- * This clustering is similar to {@link TransitiveClosure} but allows the wrapped clustering to split temporary (=not-returned) clusters. Thus, in the example above, we have the following two situations: - * - If A1-B and A2-B would be passed in the same invocation of {@link #cluster(List)}, only cluster [A2, B] would be returned. - * - If A-B is passed in a first invocation, this invocation returns [A1, B]. The following invocation with A2-B would then return [A1, A2, B]. + * This clustering is similar to {@link TransitiveClosure} but allows the wrapped clustering to split temporary + * (=not-returned) clusters. Thus, in the example above, we have the following two situations: - If A1-B and A2-B would + * be passed in the same invocation of {@link #cluster(List)}, only cluster [A2, B] would be returned. - If A1-B is + * passed in a first invocation, this invocation returns [A1, B]. The following invocation with A2-B would then return + * [A1, A2, B]. *

* It thus trades off clustering accuracy to increase reliability of subsequent data processing. - * - * @param */ @Value @Builder -public class ConsistentClustering, T, I extends Comparable> implements Clustering { +public class ConsistentClustering, T, I extends Comparable> + implements Clustering { @NonNull Clustering clustering; Function idExtractor; diff --git a/common/src/main/java/com/bakdata/deduplication/clustering/OracleClustering.java b/common/src/main/java/com/bakdata/deduplication/clustering/OracleClustering.java index 28a4f57..d2affd2 100644 --- a/common/src/main/java/com/bakdata/deduplication/clustering/OracleClustering.java +++ b/common/src/main/java/com/bakdata/deduplication/clustering/OracleClustering.java @@ -1,3 +1,27 @@ +/* + * MIT License + * + * Copyright (c) 2019 bakdata GmbH + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + package com.bakdata.deduplication.clustering; import com.bakdata.deduplication.classifier.Classification; @@ -15,25 +39,28 @@ @Value public class OracleClustering, T, I> implements Clustering { - private static final Classification DUPLICATE = Classification.builder().result(Classification.ClassificationResult.DUPLICATE).confidence(1).build(); - private static final Classification NON_DUPLICATE = Classification.builder().result(Classification.ClassificationResult.NON_DUPLICATE).confidence(1).build(); + private static final Classification DUPLICATE = + Classification.builder().result(Classification.ClassificationResult.DUPLICATE).confidence(1).build(); + private static final Classification NON_DUPLICATE = + Classification.builder().result(Classification.ClassificationResult.NON_DUPLICATE).confidence(1).build(); Collection> goldClusters; @NonNull Function idExtractor; @Getter(lazy = true) Map> idToCluster = this.goldClusters.stream() .flatMap(cluster -> - cluster.getElements().stream() - .map(e -> new AbstractMap.SimpleEntry<>(this.idExtractor.apply(e), cluster))) + cluster.getElements().stream() + .map(e -> new AbstractMap.SimpleEntry<>(this.idExtractor.apply(e), cluster))) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); @Override public List> cluster(final List> classified) { return classified.stream() - .map(candidate -> this.getIdToCluster().get(this.idExtractor.apply(candidate.getCandidate().getOldRecord()))) + .map(candidate -> this.getIdToCluster() + .get(this.idExtractor.apply(candidate.getCandidate().getOldRecord()))) .filter(Objects::nonNull) .distinct() .collect(Collectors.toList()); } -} \ No newline at end of file +} diff --git a/common/src/main/java/com/bakdata/deduplication/clustering/RefineCluster.java b/common/src/main/java/com/bakdata/deduplication/clustering/RefineCluster.java index 11f884b..7cb09fd 100644 --- a/common/src/main/java/com/bakdata/deduplication/clustering/RefineCluster.java +++ 
b/common/src/main/java/com/bakdata/deduplication/clustering/RefineCluster.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.clustering; @@ -98,43 +97,62 @@ private static float scoreClustering(final byte[] partitions, final float[][] we score += weightMatrix[rowIndex][colIndex] / partitionSizes[partitions[rowIndex]]; } else { score -= weightMatrix[rowIndex][colIndex] / (n - partitionSizes[partitions[rowIndex]]) + - weightMatrix[rowIndex][colIndex] / (n - partitionSizes[partitions[colIndex]]); + weightMatrix[rowIndex][colIndex] / (n - partitionSizes[partitions[colIndex]]); } } } return score; } - private List> getRelevantClassifications(final Cluster cluster, - final Map>> relevantClassificationIndex) { + private static List getRandomEdges(final int potentialNumEdges, final int desiredNumEdges) { + final List weightedEdges; + weightedEdges = new Random().ints(0, potentialNumEdges) + .distinct() + .limit(desiredNumEdges) + .mapToObj(i -> { + // reverse of Gaussian + int leftIndex = (int) (Math.sqrt(i + 0.25) - 0.5); + int rightIndex = i - getNumEdges(leftIndex) + leftIndex; + return WeightedEdge.of(leftIndex, rightIndex, Float.NaN); + }) + .collect(Collectors.toList()); + return weightedEdges; + } + + private List> getRelevantClassifications(final Cluster cluster, + final Map>> relevantClassificationIndex) { return cluster.getElements().stream() - .flatMap(record -> relevantClassificationIndex.getOrDefault(record, List.of()).stream() - .filter(classifiedCandidate -> 
cluster.contains(classifiedCandidate.getCandidate().getOldRecord()))) - .collect(Collectors.toList()); + .flatMap(record -> relevantClassificationIndex.getOrDefault(record, List.of()).stream() + .filter(classifiedCandidate -> cluster + .contains(classifiedCandidate.getCandidate().getOldRecord()))) + .collect(Collectors.toList()); } public List> refine(final Collection> transitiveClosure, - final Iterable> knownClassifications) { + final Iterable> knownClassifications) { final Map>> relevantClassificationIndex = - this.getRelevantClassificationIndex(knownClassifications); + this.getRelevantClassificationIndex(knownClassifications); return transitiveClosure.stream() - .flatMap(cluster -> this.refineCluster(cluster, - this.getRelevantClassifications(cluster, relevantClassificationIndex))) - .collect(Collectors.toList()); + .flatMap(cluster -> this.refineCluster(cluster, + this.getRelevantClassifications(cluster, relevantClassificationIndex))) + .collect(Collectors.toList()); } - private Map>> getRelevantClassificationIndex(final Iterable> knownClassifications) { + private Map>> getRelevantClassificationIndex( + final Iterable> knownClassifications) { final Map>> relevantClassifications = new HashMap<>(); for (final ClassifiedCandidate knownClassification : knownClassifications) { final Candidate candidate = knownClassification.getCandidate(); - relevantClassifications.computeIfAbsent(candidate.getNewRecord(), r -> new LinkedList<>()).add(knownClassification); + relevantClassifications.computeIfAbsent(candidate.getNewRecord(), r -> new LinkedList<>()) + .add(knownClassification); } return relevantClassifications; } - private byte[] refineBigCluster(final Cluster cluster, final List> knownClassifications) { + private byte[] refineBigCluster(final Cluster cluster, + final List> knownClassifications) { final List duplicates = this.toWeightedEdges(knownClassifications, cluster); - final int desiredNumEdges = RefineCluster.getNumEdges(this.maxSmallClusterSize); + final int 
desiredNumEdges = getNumEdges(this.maxSmallClusterSize); return this.greedyCluster(cluster, this.getWeightedEdges(cluster, duplicates, desiredNumEdges)); } @@ -147,7 +165,8 @@ private byte[] refineBigCluster(final Cluster cluster, final List cluster, final List> knownClassifications) { + private byte[] refineSmallCluster(final Cluster cluster, + final List> knownClassifications) { final float[][] weightMatrix = this.getKnownWeightMatrix(cluster, knownClassifications); final int n = cluster.size(); @@ -155,21 +174,22 @@ private byte[] refineSmallCluster(final Cluster cluster, final List(cluster.get(rowIndex), cluster.get(colIndex)))); + getWeight(this.classifier + .classify(new Candidate<>(cluster.get(rowIndex), cluster.get(colIndex)))); } } } return StreamSupport.stream(Spliterators.spliteratorUnknownSize(new ClusteringGenerator((byte) n), 0), false) .map(clustering -> new AbstractMap.SimpleEntry<>(clustering.clone(), - RefineCluster.scoreClustering(clustering, weightMatrix))) + scoreClustering(clustering, weightMatrix))) .max(Comparator.comparingDouble(Map.Entry::getValue)) .map(Map.Entry::getKey) .orElseThrow(() -> new IllegalStateException("Non-empty clusters should have one valid clustering")); } private List toWeightedEdges(final Collection> knownClassifications, - final Cluster cluster) { + final Cluster cluster) { final Map clusterIndex = IntStream.range(0, cluster.size()).boxed().collect(Collectors.toMap(cluster::get, i -> i)); @@ -182,7 +202,7 @@ private List toWeightedEdges(final Collection> refineCluster(final Cluster cluster, - final List> knownClassifications) { + final List> knownClassifications) { if (cluster.size() <= 2) { return Stream.of(cluster); } @@ -200,7 +220,7 @@ private Stream> refineCluster(final Cluster cluster, } private float[][] getKnownWeightMatrix(final Cluster cluster, - final Iterable> knownClassifications) { + final Iterable> knownClassifications) { final var n = cluster.size(); final var weightMatrix = new float[n][n]; for (final 
var row : weightMatrix) { @@ -221,7 +241,8 @@ private float[][] getKnownWeightMatrix(final Cluster cluster, private Stream> getSubClusters(final byte[] bestClustering, final Cluster cluster) { final Map> subClusters = IntStream.range(0, bestClustering.length) .mapToObj(index -> new AbstractMap.SimpleEntry<>(bestClustering[index], cluster.get(index))) - .collect(Collectors.groupingBy(Map.Entry::getKey, Collectors.mapping(Map.Entry::getValue, Collectors.toList()))); + .collect(Collectors + .groupingBy(Map.Entry::getKey, Collectors.mapping(Map.Entry::getValue, Collectors.toList()))); return subClusters.values().stream() .map(records -> new Cluster<>(this.clusterIdGenerator.apply(records), records)); } @@ -248,7 +269,7 @@ private byte[] greedyCluster(final Cluster cluster, final Collection score) { score = newScore; clustering = newClustering; @@ -261,7 +282,8 @@ private List addRandomEdges(final List edg // add random edges with distance 2..n of known edges (e.g., neighbors of known edges). 
List lastAddedEdges; final Set weightedEdges = new LinkedHashSet<>(edges); - for (int distance = 2; distance < this.maxSmallClusterSize && weightedEdges.size() < desiredNumEdges; distance++) { + for (int distance = 2; distance < this.maxSmallClusterSize && weightedEdges.size() < desiredNumEdges; + distance++) { lastAddedEdges = edges.stream() .flatMap(e1 -> edges.stream().filter(e1::overlaps).map(e1::getTriangleEdge)) .filter(e -> !weightedEdges.contains(e)) @@ -276,28 +298,13 @@ private List addRandomEdges(final List edg return new ArrayList<>(weightedEdges); } - private List getRandomEdges(final int potentialNumEdges, final int desiredNumEdges) { - final List weightedEdges; - weightedEdges = new Random().ints(0, potentialNumEdges) - .distinct() - .limit(desiredNumEdges) - .mapToObj(i -> { - // reverse of Gaussian - int leftIndex = (int) (Math.sqrt(i + 0.25) - 0.5); - int rightIndex = i - RefineCluster.getNumEdges(leftIndex) + leftIndex; - return WeightedEdge.of(leftIndex, rightIndex, Float.NaN); - }) - .collect(Collectors.toList()); - return weightedEdges; - } - private List getWeightedEdges(final Cluster cluster, - final List duplicates, - final int desiredNumEdges) { + final List duplicates, + final int desiredNumEdges) { final List weightedEdges; if (duplicates.isEmpty()) { final int n = cluster.size(); - weightedEdges = this.getRandomEdges(RefineCluster.getNumEdges(n), desiredNumEdges); + weightedEdges = getRandomEdges(getNumEdges(n), desiredNumEdges); } else { Collections.shuffle(duplicates); weightedEdges = this.addRandomEdges(duplicates, desiredNumEdges); @@ -382,9 +389,10 @@ WeightedEdge getTriangleEdge(final WeightedEdge e) { } boolean overlaps(final WeightedEdge e) { - return e.left == this.left || e.getLeft() == this.right || e.right == this.getLeft() || e.getRight() == this - .getRight(); + return e.left == this.getLeft() || e.getLeft() == this.right || e.getRight() == this.getLeft() + || e.getRight() == this + .getRight(); } } -} \ No newline at end 
of file +} diff --git a/common/src/main/java/com/bakdata/deduplication/clustering/RefinedTransitiveClosure.java b/common/src/main/java/com/bakdata/deduplication/clustering/RefinedTransitiveClosure.java index fb6b830..cdb12a4 100644 --- a/common/src/main/java/com/bakdata/deduplication/clustering/RefinedTransitiveClosure.java +++ b/common/src/main/java/com/bakdata/deduplication/clustering/RefinedTransitiveClosure.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.clustering; @@ -38,7 +37,8 @@ @Value @Builder -public class RefinedTransitiveClosure, T, I extends Comparable> implements Clustering { +public class RefinedTransitiveClosure, T, I extends Comparable> + implements Clustering { @NonNull RefineCluster refineCluster; @@ -53,12 +53,12 @@ public class RefinedTransitiveClosure, T, I extends Comp @java.beans.ConstructorProperties({"refineCluster", "oldClusterIndex", "closure", "idExtractor"}) RefinedTransitiveClosure(final @NonNull RefineCluster refineCluster, - final Map> oldClusterIndex, final TransitiveClosure closure, - final @NonNull Function idExtractor) { + final Map> oldClusterIndex, final TransitiveClosure closure, + final @NonNull Function idExtractor) { this.refineCluster = refineCluster; this.oldClusterIndex = oldClusterIndex != null ? oldClusterIndex : new HashMap<>(); this.closure = closure != null ? 
closure - : new TransitiveClosure<>(idExtractor, refineCluster.getClusterIdGenerator(), new HashMap<>()); + : new TransitiveClosure<>(idExtractor, refineCluster.getClusterIdGenerator(), new HashMap<>()); this.idExtractor = idExtractor; } diff --git a/common/src/main/java/com/bakdata/deduplication/clustering/TransitiveClosure.java b/common/src/main/java/com/bakdata/deduplication/clustering/TransitiveClosure.java index 97f38de..daecf37 100644 --- a/common/src/main/java/com/bakdata/deduplication/clustering/TransitiveClosure.java +++ b/common/src/main/java/com/bakdata/deduplication/clustering/TransitiveClosure.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.clustering; @@ -41,7 +40,8 @@ @Value @Builder -public class TransitiveClosure, T, I extends Comparable> implements Clustering { +public class TransitiveClosure, T, I extends Comparable> + implements Clustering { @NonNull Function idExtractor; @NonNull @@ -53,7 +53,8 @@ public class TransitiveClosure, T, I extends Comparable< @Override public List> cluster(final List> classified) { final List> duplicates = classified.stream() - .filter(classifiedCandidate -> classifiedCandidate.getClassification().getResult() == Classification.ClassificationResult.DUPLICATE) + .filter(classifiedCandidate -> classifiedCandidate.getClassification().getResult() + == Classification.ClassificationResult.DUPLICATE) .map(ClassifiedCandidate::getCandidate) .collect(Collectors.toList()); return this.clusterDuplicates(duplicates); @@ -104,7 +105,7 @@ public List> clusterDuplicates(final Iterable> duplic public void removeCluster(final Cluster cluster) { final List recordIds = cluster.getElements().stream() - .map(this.idExtractor) + .map(this.idExtractor) .collect(Collectors.toList()); final Map>> referredCluster = recordIds.stream() .map(this.clusterIndex::get) diff --git a/common/src/main/java/com/bakdata/deduplication/fusion/AnnotatedValue.java b/common/src/main/java/com/bakdata/deduplication/fusion/AnnotatedValue.java index c8d136e..94c4c71 100644 --- a/common/src/main/java/com/bakdata/deduplication/fusion/AnnotatedValue.java +++ b/common/src/main/java/com/bakdata/deduplication/fusion/AnnotatedValue.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.fusion; diff --git a/common/src/main/java/com/bakdata/deduplication/fusion/CommonConflictResolutions.java b/common/src/main/java/com/bakdata/deduplication/fusion/CommonConflictResolutions.java index a928a03..0de7dbb 100644 --- a/common/src/main/java/com/bakdata/deduplication/fusion/CommonConflictResolutions.java +++ b/common/src/main/java/com/bakdata/deduplication/fusion/CommonConflictResolutions.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.fusion; @@ -55,7 +54,8 @@ public static ConflictResolution corresponding(final ResolutionTag }); } - public static ConflictResolution saveAs(final ConflictResolution resolution, final ResolutionTag resolutionTag) { + public static ConflictResolution saveAs(final ConflictResolution resolution, + final ResolutionTag resolutionTag) { return new TaggedResolution<>(resolution, resolutionTag); } @@ -69,7 +69,8 @@ public static Comparator> comparator(final Comparator c public static > ConflictResolution max() { return ((values, context) -> values.stream().max(comparator()) - .map(max -> values.stream().filter(v -> v.getValue().equals(max.getValue())).collect(Collectors.toList())) + .map(max -> values.stream().filter(v -> v.getValue().equals(max.getValue())) + .collect(Collectors.toList())) .orElse(List.of())); } @@ -91,7 +92,8 @@ public static TerminalConflictResolution sum() { } public static TerminalConflictResolution random() { - return ((values, context) -> values.isEmpty() ? 
Optional.empty() : Optional.of(values.get(random.nextInt(values.size())))); + return ((values, context) -> values.isEmpty() ? Optional.empty() + : Optional.of(values.get(random.nextInt(values.size())))); } public static TerminalConflictResolution first() { @@ -119,20 +121,23 @@ public static > ConflictResolution median() { final List> sorted = new ArrayList<>(values); sorted.sort(comparator()); // create copy of list of median value(s), such that original list is not referenced anymore - return List.copyOf(sorted.subList((int) Math.floor(sorted.size() / 2.0), (int) Math.ceil(sorted.size() / 2.0))); + return List.copyOf(sorted + .subList((int) Math.floor(sorted.size() / 2.0), (int) Math.ceil(sorted.size() / 2.0))); }); } public static ConflictResolution shortest() { return ((values, context) -> values.isEmpty() ? values : - values.stream().collect(Collectors.groupingBy(v -> v.getValue().length(), TreeMap::new, Collectors.toList())) + values.stream() + .collect(Collectors.groupingBy(v -> v.getValue().length(), TreeMap::new, Collectors.toList())) .firstEntry() .getValue()); } public static ConflictResolution longest() { return ((values, context) -> values.isEmpty() ? 
values : - values.stream().collect(Collectors.groupingBy(v -> v.getValue().length(), TreeMap::new, Collectors.toList())) + values.stream() + .collect(Collectors.groupingBy(v -> v.getValue().length(), TreeMap::new, Collectors.toList())) .lastEntry() .getValue()); } @@ -142,7 +147,8 @@ public static ConflictResolution mostFrequent() { values.stream().collect(Collectors.groupingBy(AnnotatedValue::getValue)) .entrySet() .stream() - .collect(Collectors.groupingBy(entry -> entry.getValue().size(), TreeMap::new, Collectors.toList())) + .collect(Collectors + .groupingBy(entry -> entry.getValue().size(), TreeMap::new, Collectors.toList())) .lastEntry() .getValue() .stream() @@ -152,14 +158,16 @@ public static ConflictResolution mostFrequent() { public static ConflictResolution earliest() { return ((values, context) -> values.isEmpty() ? values : - values.stream().collect(Collectors.groupingBy(AnnotatedValue::getDateTime, TreeMap::new, Collectors.toList())) + values.stream() + .collect(Collectors.groupingBy(AnnotatedValue::getDateTime, TreeMap::new, Collectors.toList())) .firstEntry() .getValue()); } public static ConflictResolution latest() { return ((values, context) -> values.isEmpty() ? 
values : - values.stream().collect(Collectors.groupingBy(AnnotatedValue::getDateTime, TreeMap::new, Collectors.toList())) + values.stream() + .collect(Collectors.groupingBy(AnnotatedValue::getDateTime, TreeMap::new, Collectors.toList())) .lastEntry() .getValue()); } @@ -201,7 +209,7 @@ public static > TerminalConflictResolution } public static , R extends Collection> TerminalConflictResolution unionAll( - final Supplier ctor) { + final Supplier ctor) { return (annotatedValues, context) -> { final R collection = ctor.get(); for (final AnnotatedValue annotatedValue : annotatedValues) { @@ -223,7 +231,8 @@ public static ConflictResolution transform(final Function> ConflictResolution min() { return ((values, context) -> values.stream().min(comparator()) - .map(min -> values.stream().filter(v -> v.getValue().equals(min.getValue())).collect(Collectors.toList())) + .map(min -> values.stream().filter(v -> v.getValue().equals(min.getValue())) + .collect(Collectors.toList())) .orElse(List.of())); } @@ -233,10 +242,11 @@ static class TaggedResolution implements ConflictResolution { private final ResolutionTag resolutionTag; @Override - public List> resolvePartially(final List> values, final FusionContext context) { + public List> resolvePartially(final List> values, + final FusionContext context) { final List> annotatedValues = this.resolution.resolvePartially(values, context); context.storeValues(this.resolutionTag, annotatedValues); return annotatedValues; } } -} \ No newline at end of file +} diff --git a/common/src/main/java/com/bakdata/deduplication/fusion/ConflictResolution.java b/common/src/main/java/com/bakdata/deduplication/fusion/ConflictResolution.java index 071c6c1..b1af695 100644 --- a/common/src/main/java/com/bakdata/deduplication/fusion/ConflictResolution.java +++ b/common/src/main/java/com/bakdata/deduplication/fusion/ConflictResolution.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 
bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.fusion; @@ -40,7 +39,8 @@ default Optional resolve(final List> values, final FusionCo case 1: return Optional.of(resolvedValues.get(0).getValue()); default: - final var uniqueValues = resolvedValues.stream().map(AnnotatedValue::getValue).distinct().collect(Collectors.toList()); + final var uniqueValues = + resolvedValues.stream().map(AnnotatedValue::getValue).distinct().collect(Collectors.toList()); if (uniqueValues.size() == 1) { return Optional.of(uniqueValues.get(0)); } @@ -50,6 +50,7 @@ default Optional resolve(final List> values, final FusionCo default ConflictResolution andThen(final ConflictResolution successor) { final var predecessor = this; - return ((values, context) -> successor.resolvePartially(predecessor.resolvePartially(values, context), context)); + return ((values, context) -> successor + .resolvePartially(predecessor.resolvePartially(values, context), context)); } } diff --git a/common/src/main/java/com/bakdata/deduplication/fusion/ConflictResolutionFusion.java b/common/src/main/java/com/bakdata/deduplication/fusion/ConflictResolutionFusion.java index 3f566d5..93d5cb5 100644 --- a/common/src/main/java/com/bakdata/deduplication/fusion/ConflictResolutionFusion.java +++ b/common/src/main/java/com/bakdata/deduplication/fusion/ConflictResolutionFusion.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 
+20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.fusion; @@ -58,14 +57,17 @@ public FusedValue fuse(final Cluster cluster) { .map(e -> new AnnotatedValue<>(e, this.getSource(e), this.lastModifiedExtractor.apply(e))) .collect(Collectors.toList()); final FusionContext context = new FusionContext(); - final T resolvedValue = context.safeExecute(() -> this.rootResolution.resolve(conflictingValues, context)).flatMap(r -> r) - .orElseThrow(() -> this.createException(conflictingValues, context)); + final T resolvedValue = + context.safeExecute(() -> this.rootResolution.resolve(conflictingValues, context)).flatMap(r -> r) + .orElseThrow(() -> this.createException(conflictingValues, context)); return new FusedValue<>(resolvedValue, cluster, context.getExceptions()); } - private FusionException createException(final List> conflictingValues, final FusionContext context) { - final FusionException fusionException = new FusionException("Could not resolve conflict in " + conflictingValues, - context.getExceptions().get(0)); + private FusionException createException(final List> conflictingValues, + final FusionContext context) { + final FusionException fusionException = + new FusionException("Could not resolve conflict in " + conflictingValues, + context.getExceptions().get(0)); context.getExceptions().stream().skip(1).forEach(fusionException::addSuppressed); return fusionException; } diff --git a/common/src/main/java/com/bakdata/deduplication/fusion/ConflictResolutions.java b/common/src/main/java/com/bakdata/deduplication/fusion/ConflictResolutions.java index 9c8c71c..ef28d0b 100644 --- a/common/src/main/java/com/bakdata/deduplication/fusion/ConflictResolutions.java +++ b/common/src/main/java/com/bakdata/deduplication/fusion/ConflictResolutions.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * 
Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,24 +20,11 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.fusion; -import com.bakdata.util.FunctionalClass; -import com.bakdata.util.ObjectUtils; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.List; -import java.util.Optional; -import java.util.function.BiConsumer; -import java.util.function.Function; import java.util.function.Supplier; -import java.util.stream.Collectors; -import lombok.Value; import lombok.experimental.UtilityClass; -import lombok.experimental.Wither; @UtilityClass public class ConflictResolutions { @@ -50,186 +37,4 @@ public static Merge.MergeBuilder merge(final Class clazz) { return Merge.builder(clazz); } - @Value - public static class Merge implements ConflictResolution { - private final Supplier ctor; - private final List> fieldMerges; - - @SuppressWarnings("unchecked") - static MergeBuilder builder(final Supplier ctor) { - return new MergeBuilder<>(ctor, FunctionalClass.from((Class) ctor.get().getClass())); - } - - static MergeBuilder builder(final Class clazz) { - final FunctionalClass f = FunctionalClass.from(clazz); - return new MergeBuilder<>(f.getConstructor(), f); - } - - @Override - public List> resolvePartially(final List> annotatedValues, final FusionContext context) { - final R r = this.ctor.get(); - for (final FieldMerge fieldMerge : this.fieldMerges) { - fieldMerge.mergeInto(r, annotatedValues, context); - } - return List.of(AnnotatedValue.calculated(r)); - } - - @Value - private static class FieldMerge { - Function getter; - BiConsumer setter; - @Wither - ConflictResolution 
resolution; - - void mergeInto(final R r, final Collection> annotatedValues, final FusionContext context) { - final List> fieldValues = annotatedValues.stream() - .map(ar -> ar.withValue(this.getter.apply(ar.getValue()))) - .filter(ar -> ObjectUtils.isNonEmpty(ar.getValue())) - .collect(Collectors.toList()); - context.safeExecute(() -> { - final Optional resolvedValue = this.resolution.resolve(fieldValues, context); - resolvedValue.ifPresent(v -> this.setter.accept(r, v)); - }); - } - } - - @Value - public static class MergeBuilder { - Supplier ctor; - FunctionalClass clazz; - List> fieldMerges = new ArrayList<>(); - - public FieldMergeBuilder field(final Function getter, final BiConsumer setter) { - return new FieldMergeBuilder<>(this, getter, setter); - } - - public FieldMergeBuilder field(final FunctionalClass.Field field) { - final Function getter = field.getGetter(); - final BiConsumer setter = field.getSetter(); - return this.field(getter, setter); - } - - public FieldMergeBuilder field(final String name) { - final FunctionalClass.Field field = this.clazz.field(name); - return this.field(field); - } - - void replaceLast(final FieldMerge fieldMerge) { - this.fieldMerges.set(this.fieldMerges.size() - 1, fieldMerge); - } - - @SuppressWarnings("squid:S1452") - FieldMerge getLast() { - if (this.fieldMerges.isEmpty()) { - throw new IllegalStateException(); - } - return this.fieldMerges.get(this.fieldMerges.size() - 1); - } - - public ConflictResolution build() { - return new Merge<>(this.ctor, this.fieldMerges); - } - - private void add(final FieldMerge fieldMerge) { - this.fieldMerges.add(fieldMerge); - } - } - - @Value - public static class FieldMergeBuilder { - MergeBuilder mergeBuilder; - Function getter; - BiConsumer setter; - - public AdditionalFieldMergeBuilder with(final ConflictResolution resolution) { - return new AdditionalFieldMergeBuilder<>(this.convertingWith(resolution)); - } - - @SafeVarargs - public final AdditionalFieldMergeBuilder with(final 
ConflictResolution resolution, final ConflictResolution... resolutions) { - return this.with(Arrays.stream(resolutions).reduce(resolution, ConflictResolution::andThen)); - } - - public IllTypedFieldMergeBuilder convertingWith(final ConflictResolution resolution) { - return new IllTypedFieldMergeBuilder<>(this, resolution); - } - - public AdditionalFieldMergeBuilder corresponding(final ResolutionTag tag) { - return this.with(CommonConflictResolutions.corresponding(tag)); - } - - @SuppressWarnings("unchecked") - public AdditionalFieldMergeBuilder correspondingToPrevious() { - final var last = this.mergeBuilder.getLast(); - final ResolutionTag tag; - // auto tag previous merge if it is not tagged already - if (last.getResolution() instanceof CommonConflictResolutions.TaggedResolution) { - tag = ((CommonConflictResolutions.TaggedResolution) last.getResolution()).getResolutionTag(); - } else { - final var fieldMerges = this.mergeBuilder.getFieldMerges(); - tag = new ResolutionTag<>("tag-" + System.identityHashCode(fieldMerges) + "-" + fieldMerges.size()); - this.mergeBuilder.replaceLast(last.withResolution( - CommonConflictResolutions.saveAs(last.getResolution(), tag))); - } - return this.corresponding(tag); - } - - void finish(final ConflictResolution resolution) { - this.mergeBuilder.add(new FieldMerge<>(this.getter, this.setter, resolution)); - } - } - - @Value - public static class IllTypedFieldMergeBuilder { - FieldMergeBuilder fieldMergeBuilder; - ConflictResolution resolution; - - public IllTypedFieldMergeBuilder then(final ConflictResolution resolution) { - return new IllTypedFieldMergeBuilder<>(this.fieldMergeBuilder, this.resolution.andThen(resolution)); - } - - public AdditionalFieldMergeBuilder convertingBack(final ConflictResolution resolution) { - return new AdditionalFieldMergeBuilder<>(this.then(resolution)); - } - - MergeBuilder getMergeBuilder() { - return this.getFieldMergeBuilder().getMergeBuilder(); - } - } - - @Value - public static class 
AdditionalFieldMergeBuilder { - IllTypedFieldMergeBuilder inner; - - public FieldMergeBuilder field(final Function getter, final BiConsumer setter) { - this.inner.getFieldMergeBuilder().finish(this.inner.getResolution()); - return new FieldMergeBuilder<>(this.inner.getMergeBuilder(), getter, setter); - } - - public FieldMergeBuilder field(final FunctionalClass.Field field) { - final Function getter = field.getGetter(); - final BiConsumer setter = field.getSetter(); - return this.field(getter, setter); - } - - public FieldMergeBuilder field(final String name) { - final FunctionalClass clazz = this.inner.getMergeBuilder().getClazz(); - final FunctionalClass.Field field = clazz.field(name); - return this.field(field); - } - - public IllTypedFieldMergeBuilder convertingWith(final ConflictResolution resolution) { - return this.inner.then(resolution); - } - - public AdditionalFieldMergeBuilder then(final ConflictResolution resolution) { - return new AdditionalFieldMergeBuilder<>(this.inner.then(resolution)); - } - - public ConflictResolution build() { - this.inner.getFieldMergeBuilder().finish(this.inner.getResolution()); - return this.inner.getMergeBuilder().build(); - } - } - } } diff --git a/common/src/main/java/com/bakdata/deduplication/fusion/FusionContext.java b/common/src/main/java/com/bakdata/deduplication/fusion/FusionContext.java index b57f6ca..6d6ea32 100644 --- a/common/src/main/java/com/bakdata/deduplication/fusion/FusionContext.java +++ b/common/src/main/java/com/bakdata/deduplication/fusion/FusionContext.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.fusion; diff --git a/common/src/main/java/com/bakdata/deduplication/fusion/Merge.java b/common/src/main/java/com/bakdata/deduplication/fusion/Merge.java new file mode 100644 index 0000000..80b64a6 --- /dev/null +++ b/common/src/main/java/com/bakdata/deduplication/fusion/Merge.java @@ -0,0 +1,228 @@ +/* + * MIT License + * + * Copyright (c) 2019 bakdata GmbH + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package com.bakdata.deduplication.fusion; + +import com.bakdata.util.FunctionalClass; +import com.bakdata.util.ObjectUtils; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Optional; +import java.util.function.BiConsumer; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.Value; +import lombok.experimental.Wither; + +@Value +@RequiredArgsConstructor(access = AccessLevel.PRIVATE) +public class Merge implements ConflictResolution { + private final Supplier ctor; + private final List> fieldMerges; + + @SuppressWarnings("unchecked") + static MergeBuilder builder(final Supplier ctor) { + return new MergeBuilder<>(ctor, FunctionalClass.from((Class) ctor.get().getClass())); + } + + static MergeBuilder builder(final Class clazz) { + final FunctionalClass f = FunctionalClass.from(clazz); + return new MergeBuilder<>(f.getConstructor(), f); + } + + @Override + public List> resolvePartially(final List> annotatedValues, + final FusionContext context) { + final R r = this.ctor.get(); + for (final FieldMerge fieldMerge : this.fieldMerges) { + fieldMerge.mergeInto(r, annotatedValues, context); + } + return List.of(AnnotatedValue.calculated(r)); + } + + @Value + private static class FieldMerge { + Function getter; + BiConsumer setter; + @Wither + ConflictResolution resolution; + + void mergeInto(final R r, final Collection> annotatedValues, + final FusionContext context) { + final List> fieldValues = annotatedValues.stream() + .map(ar -> ar.withValue(this.getter.apply(ar.getValue()))) + .filter(ar -> ObjectUtils.isNonEmpty(ar.getValue())) + .collect(Collectors.toList()); + context.safeExecute(() -> { + final Optional resolvedValue = this.resolution.resolve(fieldValues, context); + resolvedValue.ifPresent(v -> this.setter.accept(r, v)); + }); + } + } + + 
@Value + public static class MergeBuilder { + Supplier ctor; + FunctionalClass clazz; + List> fieldMerges = new ArrayList<>(); + + public FieldMergeBuilder field(final Function getter, final BiConsumer setter) { + return new FieldMergeBuilder<>(this, getter, setter); + } + + public FieldMergeBuilder field(final FunctionalClass.Field field) { + final Function getter = field.getGetter(); + final BiConsumer setter = field.getSetter(); + return this.field(getter, setter); + } + + public FieldMergeBuilder field(final String name) { + final FunctionalClass.Field field = this.clazz.field(name); + return this.field(field); + } + + void replaceLast(final FieldMerge fieldMerge) { + this.fieldMerges.set(this.fieldMerges.size() - 1, fieldMerge); + } + + @SuppressWarnings("squid:S1452") + FieldMerge getLast() { + if (this.fieldMerges.isEmpty()) { + throw new IllegalStateException(); + } + return this.fieldMerges.get(this.fieldMerges.size() - 1); + } + + public ConflictResolution build() { + return new Merge<>(this.ctor, this.fieldMerges); + } + + private void add(final FieldMerge fieldMerge) { + this.fieldMerges.add(fieldMerge); + } + } + + @Value + public static class FieldMergeBuilder { + MergeBuilder mergeBuilder; + Function getter; + BiConsumer setter; + + public AdditionalFieldMergeBuilder with(final ConflictResolution resolution) { + return new AdditionalFieldMergeBuilder<>(this.convertingWith(resolution)); + } + + @SafeVarargs + public final AdditionalFieldMergeBuilder with(final ConflictResolution resolution, + final ConflictResolution... 
resolutions) { + return this.with(Arrays.stream(resolutions).reduce(resolution, ConflictResolution::andThen)); + } + + public IllTypedFieldMergeBuilder convertingWith(final ConflictResolution resolution) { + return new IllTypedFieldMergeBuilder<>(this, resolution); + } + + public AdditionalFieldMergeBuilder corresponding(final ResolutionTag tag) { + return this.with(CommonConflictResolutions.corresponding(tag)); + } + + @SuppressWarnings("unchecked") + public AdditionalFieldMergeBuilder correspondingToPrevious() { + final var last = this.mergeBuilder.getLast(); + final ResolutionTag tag; + // auto tag previous merge if it is not tagged already + if (last.getResolution() instanceof CommonConflictResolutions.TaggedResolution) { + tag = ((CommonConflictResolutions.TaggedResolution) last.getResolution()).getResolutionTag(); + } else { + final var fieldMerges = this.mergeBuilder.getFieldMerges(); + tag = new ResolutionTag<>("tag-" + System.identityHashCode(fieldMerges) + "-" + fieldMerges.size()); + this.mergeBuilder.replaceLast(last.withResolution( + CommonConflictResolutions.saveAs(last.getResolution(), tag))); + } + return this.corresponding(tag); + } + + void finish(final ConflictResolution resolution) { + this.mergeBuilder.add(new FieldMerge<>(this.getter, this.setter, resolution)); + } + } + + @Value + public static class IllTypedFieldMergeBuilder { + FieldMergeBuilder fieldMergeBuilder; + ConflictResolution resolution; + + public IllTypedFieldMergeBuilder then(final ConflictResolution resolution) { + return new IllTypedFieldMergeBuilder<>(this.fieldMergeBuilder, this.resolution.andThen(resolution)); + } + + public AdditionalFieldMergeBuilder convertingBack(final ConflictResolution resolution) { + return new AdditionalFieldMergeBuilder<>(this.then(resolution)); + } + + MergeBuilder getMergeBuilder() { + return this.getFieldMergeBuilder().getMergeBuilder(); + } + } + + @Value + public static class AdditionalFieldMergeBuilder { + IllTypedFieldMergeBuilder inner; + + 
public FieldMergeBuilder field(final Function getter, final BiConsumer setter) { + this.inner.getFieldMergeBuilder().finish(this.inner.getResolution()); + return new FieldMergeBuilder<>(this.inner.getMergeBuilder(), getter, setter); + } + + public FieldMergeBuilder field(final FunctionalClass.Field field) { + final Function getter = field.getGetter(); + final BiConsumer setter = field.getSetter(); + return this.field(getter, setter); + } + + public FieldMergeBuilder field(final String name) { + final FunctionalClass clazz = this.inner.getMergeBuilder().getClazz(); + final FunctionalClass.Field field = clazz.field(name); + return this.field(field); + } + + public IllTypedFieldMergeBuilder convertingWith(final ConflictResolution resolution) { + return this.inner.then(resolution); + } + + public AdditionalFieldMergeBuilder then(final ConflictResolution resolution) { + return new AdditionalFieldMergeBuilder<>(this.inner.then(resolution)); + } + + public ConflictResolution build() { + this.inner.getFieldMergeBuilder().finish(this.inner.getResolution()); + return this.inner.getMergeBuilder().build(); + } + } +} diff --git a/common/src/main/java/com/bakdata/deduplication/fusion/ResolutionPath.java b/common/src/main/java/com/bakdata/deduplication/fusion/ResolutionPath.java index 3a8d1e6..c15d2d7 100644 --- a/common/src/main/java/com/bakdata/deduplication/fusion/ResolutionPath.java +++ b/common/src/main/java/com/bakdata/deduplication/fusion/ResolutionPath.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.fusion; @@ -37,11 +36,12 @@ public class ResolutionPath implements ConflictResolution { ConflictResolution resolution; @Override - public List> resolvePartially(final List> annotatedValues, final FusionContext context) { + public List> resolvePartially(final List> annotatedValues, + final FusionContext context) { final List> fieldValues = annotatedValues.stream() - .map(ar -> ar.withValue(this.extractor.apply(ar.getValue()))) - .filter(ar -> isNonEmpty(ar.getValue())) - .collect(Collectors.toList()); + .map(ar -> ar.withValue(this.extractor.apply(ar.getValue()))) + .filter(ar -> isNonEmpty(ar.getValue())) + .collect(Collectors.toList()); return this.resolution.resolvePartially(fieldValues, context); } } diff --git a/common/src/main/java/com/bakdata/deduplication/fusion/ResolutionTag.java b/common/src/main/java/com/bakdata/deduplication/fusion/ResolutionTag.java index a1baa75..4a64d1a 100644 --- a/common/src/main/java/com/bakdata/deduplication/fusion/ResolutionTag.java +++ b/common/src/main/java/com/bakdata/deduplication/fusion/ResolutionTag.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.fusion; diff --git a/common/src/main/java/com/bakdata/deduplication/fusion/Source.java b/common/src/main/java/com/bakdata/deduplication/fusion/Source.java index 7b88f6c..93eb97f 100644 --- a/common/src/main/java/com/bakdata/deduplication/fusion/Source.java +++ b/common/src/main/java/com/bakdata/deduplication/fusion/Source.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.fusion; diff --git a/common/src/main/java/com/bakdata/deduplication/fusion/TerminalConflictResolution.java b/common/src/main/java/com/bakdata/deduplication/fusion/TerminalConflictResolution.java index 895f0d2..24ba400 100644 --- a/common/src/main/java/com/bakdata/deduplication/fusion/TerminalConflictResolution.java +++ b/common/src/main/java/com/bakdata/deduplication/fusion/TerminalConflictResolution.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.fusion; @@ -29,7 +28,8 @@ @FunctionalInterface public interface TerminalConflictResolution extends ConflictResolution { - default List> resolvePartially(final List> values, final FusionContext context) { + default List> resolvePartially(final List> values, + final FusionContext context) { return this.resolveFully(values, context).map(List::of).orElse(List.of()); } diff --git a/common/src/main/java/com/bakdata/deduplication/similarity/CommonSimilarityMeasures.java b/common/src/main/java/com/bakdata/deduplication/similarity/CommonSimilarityMeasures.java index 7e614a0..5695aff 100644 --- a/common/src/main/java/com/bakdata/deduplication/similarity/CommonSimilarityMeasures.java +++ b/common/src/main/java/com/bakdata/deduplication/similarity/CommonSimilarityMeasures.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.similarity; @@ -53,7 +52,6 @@ import org.apache.commons.codec.language.Soundex; import org.apache.commons.codec.language.bm.BeiderMorseEncoder; import org.apache.commons.text.similarity.JaroWinklerDistance; -import org.apache.commons.text.similarity.LevenshteinDistance; import org.apache.commons.text.similarity.SimilarityScore; @UtilityClass @@ -76,9 +74,9 @@ public static SimilarityMeasure inequality() { public static > SimilarityMeasure jaccard() { return (left, right, context) -> { @SuppressWarnings("unchecked") final Set leftSet = - left instanceof Set ? (Set) left : new HashSet<>(left); + left instanceof Set ? 
(Set) left : new HashSet<>(left); @SuppressWarnings("unchecked") final Set rightSet = - left instanceof Set ? (Set) right : new HashSet<>(right); + left instanceof Set ? (Set) right : new HashSet<>(right); final long intersectCount = leftSet.stream().filter(rightSet::contains).count(); return (float) intersectCount / (rightSet.size() + leftSet.size() - intersectCount); }; @@ -92,7 +90,8 @@ public static SimilarityMeasure jaroWinkler() { return new SimilarityScoreMeasure<>(new JaroWinklerDistance()); } - public static > SimilarityMeasure mongeElkan(final SimilarityMeasure pairMeasure) { + public static > SimilarityMeasure mongeElkan( + final SimilarityMeasure pairMeasure) { return mongeElkan(pairMeasure, Integer.MAX_VALUE / 2); } @@ -103,9 +102,9 @@ public static > SimilarityMeasure cosine return unknown(); } final Map leftHistogram = - left.stream().collect(Collectors.groupingBy(w -> w, Collectors.counting())); + left.stream().collect(Collectors.groupingBy(w -> w, Collectors.counting())); final Map rightHistogram = - right.stream().collect(Collectors.groupingBy(w -> w, Collectors.counting())); + right.stream().collect(Collectors.groupingBy(w -> w, Collectors.counting())); float dotProduct = 0; for (final Map.Entry leftEntry : leftHistogram.entrySet()) { final Long rightCount = rightHistogram.get(leftEntry.getKey()); @@ -122,7 +121,8 @@ private static float getLength(final Map histogram) { } - public static > SimilarityMeasure mongeElkan(final SimilarityMeasure pairMeasure, final int maxPositionDiff) { + public static > SimilarityMeasure mongeElkan( + final SimilarityMeasure pairMeasure, final int maxPositionDiff) { return new MongeElkan<>(pairMeasure, maxPositionDiff, 0); } @@ -130,12 +130,8 @@ public static SimilarityMeasure negate(final SimilarityMeasure return (left, right, context) -> 1 - measure.getSimilarity(left, right, context); } - @SuppressWarnings("unchecked") - private static > List ensureList(final C leftCollection) { - return leftCollection instanceof 
List ? (List) leftCollection : List.copyOf(leftCollection); - } - - public static > SimilarityMeasure positionWise(final SimilarityMeasure pairMeasure) { + public static > SimilarityMeasure positionWise( + final SimilarityMeasure pairMeasure) { return mongeElkan(pairMeasure, 0); } @@ -222,36 +218,21 @@ public static SimilarityTransformation WeightedAggregation.WeightedAggregationBuilder weightedAggregation(final BiFunction, List, Float> aggregator) { + public static WeightedAggregation.WeightedAggregationBuilder weightedAggregation( + final BiFunction, List, Float> aggregator) { return WeightedAggregation.builder().aggregator(aggregator); } public static WeightedAggregation.WeightedAggregationBuilder weightedAverage() { return weightedAggregation((weightedSims, weights) -> - (float) (weightedSims.stream().mapToDouble(sim -> sim).sum() / weights.stream().mapToDouble(w -> w).sum())); + (float) (weightedSims.stream().mapToDouble(sim -> sim).sum() / weights.stream().mapToDouble(w -> w) + .sum())); } static int getMaxLen(final CharSequence left, final CharSequence right) { return Math.max(left.length(), right.length()); } - /** - * Used to translate {@link SimilarityScore} that are actually distance functions to similarity scores - */ - @RequiredArgsConstructor - public static class DistanceSimilarityMeasure implements SimilarityMeasure { - private final SimilarityScore score; - - @Override - public float getSimilarity(final CharSequence left, final CharSequence right, final SimilarityContext context) { - final float distance = this.score.apply(left, right).floatValue(); - if (distance == -1) { - return 0; - } - return 1.0f - distance / getMaxLen(left, right); - } - } - @RequiredArgsConstructor public static class SimilarityScoreMeasure implements SimilarityMeasure { private final SimilarityScore score; @@ -262,30 +243,6 @@ public float getSimilarity(final CharSequence left, final CharSequence right, fi } } - public static class Levensthein implements SimilarityMeasure 
{ - private final float threshold; - - public Levensthein(final float threshold) { - this.threshold = threshold; - } - - @Override - public float getSimilarity(final CharSequence left, final CharSequence right, final SimilarityContext context) { - final var maxLen = getMaxLen(left, right); - final var maxDiff = (int) (maxLen * (1 - this.threshold)); - final var measure = new DistanceSimilarityMeasure(new LevenshteinDistance(maxDiff)); - return measure.getSimilarity(left, right, context); - } - - @Override - public SimilarityMeasure cutoff(final float threshold) { - if (threshold < this.threshold) { - return this; - } - return new Levensthein<>(threshold); - } - } - @Builder @Value public static class WeightedAggregation implements SimilarityMeasure { @@ -293,7 +250,8 @@ public static class WeightedAggregation implements SimilarityMeasure { @Singular List> weightedSimilarities; @Getter(lazy = true) - List weights = this.weightedSimilarities.stream().map(WeightedSimilarity::getWeight).collect(Collectors.toList()); + List weights = + this.weightedSimilarities.stream().map(WeightedSimilarity::getWeight).collect(Collectors.toList()); @Override public float getSimilarity(final R left, final R right, final SimilarityContext context) { @@ -320,7 +278,8 @@ public static class WeightedSimilarity { } public static class WeightedAggregationBuilder { - public WeightedAggregationBuilder add(final float weight, final Function extractor, final SimilarityMeasure measure) { + public WeightedAggregationBuilder add(final float weight, final Function extractor, + final SimilarityMeasure measure) { return this.add(weight, measure.of(extractor)); } @@ -330,38 +289,4 @@ public WeightedAggregationBuilder add(final float weight, final SimilarityMea } } - @Value - private static class MongeElkan, T> implements SimilarityMeasure { - private final SimilarityMeasure pairMeasure; - private final int maxPositionDiff; - private final float cutoff; - - @Override - public float getSimilarity(final C 
leftCollection, final C rightCollection, final SimilarityContext context) { - if (leftCollection.isEmpty() || rightCollection.isEmpty()) { - return 0; - } - final List leftList = ensureList(leftCollection); - final List rightList = ensureList(rightCollection); - // when cutoff is .9 and |left| = 3, then on average each element has .1 buffer - // as soon as the current sum + buffer < index, the cutoff threshold cannot be passed (buffer used up) - final float cutoffBuffer = (1 - this.cutoff) * leftCollection.size(); - float sum = 0; - for (int leftIndex = 0; leftIndex < leftCollection.size() && (cutoffBuffer + sum) >= leftIndex; leftIndex++) { - float max = 0; - for (int rightIndex = Math.max(0, leftIndex - this.maxPositionDiff), - rightMax = Math.min(rightCollection.size(), leftIndex + this.maxPositionDiff); max < 1.0 && rightIndex < rightMax; rightIndex++) { - max = Math.max(max, this.pairMeasure - .getSimilarity(leftList.get(leftIndex), rightList.get(rightIndex), context)); - } - sum += max; - } - return CutoffSimiliarityMeasure.cutoff(sum / leftCollection.size(), this.cutoff); - } - - @Override - public SimilarityMeasure cutoff(final float threshold) { - return new MongeElkan<>(this.pairMeasure, this.maxPositionDiff, threshold); - } - } } diff --git a/common/src/main/java/com/bakdata/deduplication/similarity/DistanceSimilarityMeasure.java b/common/src/main/java/com/bakdata/deduplication/similarity/DistanceSimilarityMeasure.java new file mode 100644 index 0000000..495ffc0 --- /dev/null +++ b/common/src/main/java/com/bakdata/deduplication/similarity/DistanceSimilarityMeasure.java @@ -0,0 +1,45 @@ +/* + * MIT License + * + * Copyright (c) 2019 bakdata GmbH + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package com.bakdata.deduplication.similarity; + +import lombok.RequiredArgsConstructor; +import org.apache.commons.text.similarity.SimilarityScore; + +/** + * Used to translate {@link SimilarityScore} that are actually distance functions to similarity scores + */ +@RequiredArgsConstructor +public class DistanceSimilarityMeasure implements SimilarityMeasure { + private final SimilarityScore score; + + @Override + public float getSimilarity(final CharSequence left, final CharSequence right, final SimilarityContext context) { + final float distance = this.score.apply(left, right).floatValue(); + if (distance == -1) { + return 0; + } + return 1.0f - distance / CommonSimilarityMeasures.getMaxLen(left, right); + } +} diff --git a/common/src/main/java/com/bakdata/deduplication/similarity/Levensthein.java b/common/src/main/java/com/bakdata/deduplication/similarity/Levensthein.java new file mode 100644 index 0000000..c56107a --- /dev/null +++ b/common/src/main/java/com/bakdata/deduplication/similarity/Levensthein.java @@ -0,0 +1,51 @@ +/* + * MIT License + * + * Copyright (c) 2019 bakdata GmbH + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and 
associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package com.bakdata.deduplication.similarity; + +import org.apache.commons.text.similarity.LevenshteinDistance; + +public class Levensthein implements SimilarityMeasure { + private final float threshold; + + public Levensthein(final float threshold) { + this.threshold = threshold; + } + + @Override + public float getSimilarity(final CharSequence left, final CharSequence right, final SimilarityContext context) { + final var maxLen = CommonSimilarityMeasures.getMaxLen(left, right); + final var maxDiff = (int) (maxLen * (1 - this.threshold)); + final var measure = new DistanceSimilarityMeasure(new LevenshteinDistance(maxDiff)); + return measure.getSimilarity(left, right, context); + } + + @Override + public SimilarityMeasure cutoff(final float threshold) { + if (threshold < this.threshold) { + return this; + } + return new Levensthein<>(threshold); + } +} diff --git a/common/src/main/java/com/bakdata/deduplication/similarity/MongeElkan.java 
b/common/src/main/java/com/bakdata/deduplication/similarity/MongeElkan.java new file mode 100644 index 0000000..490a367 --- /dev/null +++ b/common/src/main/java/com/bakdata/deduplication/similarity/MongeElkan.java @@ -0,0 +1,71 @@ +/* + * MIT License + * + * Copyright (c) 2019 bakdata GmbH + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package com.bakdata.deduplication.similarity; + +import java.util.Collection; +import java.util.List; +import lombok.Value; + +@Value +class MongeElkan, T> implements SimilarityMeasure { + private final SimilarityMeasure pairMeasure; + private final int maxPositionDiff; + private final float cutoff; + + @SuppressWarnings("unchecked") + private static > List ensureList(final C leftCollection) { + return leftCollection instanceof List ? 
(List) leftCollection : List.copyOf(leftCollection); + } + + @Override + public float getSimilarity(final C leftCollection, final C rightCollection, final SimilarityContext context) { + if (leftCollection.isEmpty() || rightCollection.isEmpty()) { + return 0; + } + final List leftList = ensureList(leftCollection); + final List rightList = ensureList(rightCollection); + // when cutoff is .9 and |left| = 3, then on average each element has .1 buffer + // as soon as the current sum + buffer < index, the cutoff threshold cannot be passed (buffer used up) + final float cutoffBuffer = (1 - this.cutoff) * leftCollection.size(); + float sum = 0; + for (int leftIndex = 0; leftIndex < leftCollection.size() && (cutoffBuffer + sum) >= leftIndex; + leftIndex++) { + float max = 0; + for (int rightIndex = Math.max(0, leftIndex - this.maxPositionDiff), + rightMax = Math.min(rightCollection.size(), leftIndex + this.maxPositionDiff); + max < 1.0 && rightIndex < rightMax; rightIndex++) { + max = Math.max(max, this.pairMeasure + .getSimilarity(leftList.get(leftIndex), rightList.get(rightIndex), context)); + } + sum += max; + } + return CutoffSimiliarityMeasure.cutoff(sum / leftCollection.size(), this.cutoff); + } + + @Override + public SimilarityMeasure cutoff(final float threshold) { + return new MongeElkan<>(this.pairMeasure, this.maxPositionDiff, threshold); + } +} diff --git a/common/src/main/java/com/bakdata/util/ObjectUtils.java b/common/src/main/java/com/bakdata/util/ObjectUtils.java index a721273..68a75bd 100644 --- a/common/src/main/java/com/bakdata/util/ObjectUtils.java +++ b/common/src/main/java/com/bakdata/util/ObjectUtils.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF 
CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.util; diff --git a/common/src/test/java/com/bakdata/deduplication/fusion/ConflictResolutionsTest.java b/common/src/test/java/com/bakdata/deduplication/fusion/ConflictResolutionsTest.java index a222e28..dcfc3b0 100644 --- a/common/src/test/java/com/bakdata/deduplication/fusion/ConflictResolutionsTest.java +++ b/common/src/test/java/com/bakdata/deduplication/fusion/ConflictResolutionsTest.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.fusion; @@ -34,9 +33,9 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatExceptionOfType; -import com.bakdata.deduplication.fusion.ConflictResolutions.Merge.AdditionalFieldMergeBuilder; -import com.bakdata.deduplication.fusion.ConflictResolutions.Merge.FieldMergeBuilder; -import com.bakdata.deduplication.fusion.ConflictResolutions.Merge.MergeBuilder; +import com.bakdata.deduplication.fusion.Merge.AdditionalFieldMergeBuilder; +import com.bakdata.deduplication.fusion.Merge.FieldMergeBuilder; +import com.bakdata.deduplication.fusion.Merge.MergeBuilder; import com.bakdata.util.FunctionalClass; import com.google.common.collect.Sets; import java.beans.IntrospectionException; @@ -295,7 +294,7 @@ private static final class PersonWithoutGetter { private static final class PersonWithoutSetter { @Getter - private String id = null; + private String id; } -} \ No newline at end of file +} diff --git a/core/build.gradle.kts b/core/build.gradle.kts index 40007a8..4cba52c 100644 --- a/core/build.gradle.kts +++ b/core/build.gradle.kts @@ -1,4 +1,4 @@ description = "Base interfaces and data structures for defining a deduplication workflow." 
dependencies { -} \ No newline at end of file +} diff --git a/core/src/main/java/com/bakdata/deduplication/ExceptionContext.java b/core/src/main/java/com/bakdata/deduplication/ExceptionContext.java index 2a9c5f7..020a7cb 100644 --- a/core/src/main/java/com/bakdata/deduplication/ExceptionContext.java +++ b/core/src/main/java/com/bakdata/deduplication/ExceptionContext.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication; diff --git a/core/src/main/java/com/bakdata/deduplication/candidate_selection/Candidate.java b/core/src/main/java/com/bakdata/deduplication/candidate_selection/Candidate.java index cae9834..1df0abc 100644 --- a/core/src/main/java/com/bakdata/deduplication/candidate_selection/Candidate.java +++ b/core/src/main/java/com/bakdata/deduplication/candidate_selection/Candidate.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.candidate_selection; @@ -32,7 +31,7 @@ public class Candidate { T oldRecord; public static > Candidate normalized(final T record1, final T record2) { - if(record1.compareTo(record2) <= 0) { + if (record1.compareTo(record2) <= 0) { return new Candidate<>(record1, record2); } return new Candidate<>(record2, record1); diff --git a/core/src/main/java/com/bakdata/deduplication/candidate_selection/offline/OfflineCandidateSelection.java b/core/src/main/java/com/bakdata/deduplication/candidate_selection/offline/OfflineCandidateSelection.java index 2940fbf..72bfd87 100644 --- a/core/src/main/java/com/bakdata/deduplication/candidate_selection/offline/OfflineCandidateSelection.java +++ b/core/src/main/java/com/bakdata/deduplication/candidate_selection/offline/OfflineCandidateSelection.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,12 +20,10 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.candidate_selection.offline; import com.bakdata.deduplication.candidate_selection.Candidate; - import java.util.List; @FunctionalInterface diff --git a/core/src/main/java/com/bakdata/deduplication/candidate_selection/online/OnlineCandidateSelection.java b/core/src/main/java/com/bakdata/deduplication/candidate_selection/online/OnlineCandidateSelection.java index b01ed53..cf698e9 100644 --- a/core/src/main/java/com/bakdata/deduplication/candidate_selection/online/OnlineCandidateSelection.java +++ b/core/src/main/java/com/bakdata/deduplication/candidate_selection/online/OnlineCandidateSelection.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,12 +20,10 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.candidate_selection.online; import com.bakdata.deduplication.candidate_selection.Candidate; - import java.util.List; @FunctionalInterface diff --git a/core/src/main/java/com/bakdata/deduplication/classifier/Classification.java b/core/src/main/java/com/bakdata/deduplication/classifier/Classification.java index 6b0b895..7b01198 100644 --- a/core/src/main/java/com/bakdata/deduplication/classifier/Classification.java +++ b/core/src/main/java/com/bakdata/deduplication/classifier/Classification.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.classifier; diff --git a/core/src/main/java/com/bakdata/deduplication/classifier/ClassifiedCandidate.java b/core/src/main/java/com/bakdata/deduplication/classifier/ClassifiedCandidate.java index c7a0005..d7e554b 100644 --- a/core/src/main/java/com/bakdata/deduplication/classifier/ClassifiedCandidate.java +++ b/core/src/main/java/com/bakdata/deduplication/classifier/ClassifiedCandidate.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.classifier; @@ -31,4 +30,4 @@ public class ClassifiedCandidate { Candidate candidate; Classification classification; -} \ No newline at end of file +} diff --git a/core/src/main/java/com/bakdata/deduplication/classifier/Classifier.java b/core/src/main/java/com/bakdata/deduplication/classifier/Classifier.java index 92bbb4e..fdd6160 100644 --- a/core/src/main/java/com/bakdata/deduplication/classifier/Classifier.java +++ b/core/src/main/java/com/bakdata/deduplication/classifier/Classifier.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.classifier; @@ -29,4 +28,4 @@ @FunctionalInterface public interface Classifier { Classification classify(Candidate candidate); -} \ No newline at end of file +} diff --git a/core/src/main/java/com/bakdata/deduplication/clustering/Cluster.java b/core/src/main/java/com/bakdata/deduplication/clustering/Cluster.java index ab0555e..70923b1 100644 --- a/core/src/main/java/com/bakdata/deduplication/clustering/Cluster.java +++ b/core/src/main/java/com/bakdata/deduplication/clustering/Cluster.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.clustering; @@ -38,7 +37,7 @@ @Builder public class Cluster, T> { @SuppressWarnings("squid:S4276") - private static final Function, Integer> INT_GENERATOR = new Function<>() { + private static final Function INT_GENERATOR = new Function, Integer>() { private final AtomicInteger nextId = new AtomicInteger(); @Override @@ -47,7 +46,7 @@ public Integer apply(final Iterable objects) { } }; @SuppressWarnings("squid:S4276") - private static final Function, Long> LONG_GENERATOR = new Function<>() { + private static final Function LONG_GENERATOR = new Function, Long>() { private final AtomicLong nextId = new AtomicLong(); @Override @@ -64,7 +63,7 @@ public Cluster(final C id) { @SuppressWarnings("unchecked") public static Function, Integer> intGenerator() { - return (Function) INT_GENERATOR; + return INT_GENERATOR; } @SuppressWarnings("unchecked") @@ -88,7 +87,8 @@ public boolean contains(final T record) { return this.elements.contains(record); } - public Cluster merge(final Function, ? extends C> idGenerator, final Cluster other) { + public Cluster merge(final Function, ? 
extends C> idGenerator, + final Cluster other) { if (other == this) { return this; } diff --git a/core/src/main/java/com/bakdata/deduplication/clustering/ClusterSplitHandler.java b/core/src/main/java/com/bakdata/deduplication/clustering/ClusterSplitHandler.java index 6e3e40b..be26a63 100644 --- a/core/src/main/java/com/bakdata/deduplication/clustering/ClusterSplitHandler.java +++ b/core/src/main/java/com/bakdata/deduplication/clustering/ClusterSplitHandler.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.clustering; diff --git a/core/src/main/java/com/bakdata/deduplication/clustering/Clustering.java b/core/src/main/java/com/bakdata/deduplication/clustering/Clustering.java index ef7a117..2ac8375 100644 --- a/core/src/main/java/com/bakdata/deduplication/clustering/Clustering.java +++ b/core/src/main/java/com/bakdata/deduplication/clustering/Clustering.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,12 +20,10 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.clustering; import com.bakdata.deduplication.classifier.ClassifiedCandidate; - import java.util.List; import java.util.function.Function; diff --git a/core/src/main/java/com/bakdata/deduplication/deduplication/HardFusionHandler.java b/core/src/main/java/com/bakdata/deduplication/deduplication/HardFusionHandler.java index fbe3fdf..d19702f 100644 --- a/core/src/main/java/com/bakdata/deduplication/deduplication/HardFusionHandler.java +++ b/core/src/main/java/com/bakdata/deduplication/deduplication/HardFusionHandler.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,12 +20,10 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.deduplication; import com.bakdata.deduplication.fusion.FusedValue; - import java.util.Optional; import java.util.function.Function; diff --git a/core/src/main/java/com/bakdata/deduplication/deduplication/online/OnlineDeduplication.java b/core/src/main/java/com/bakdata/deduplication/deduplication/online/OnlineDeduplication.java index 953af0b..87da29d 100644 --- a/core/src/main/java/com/bakdata/deduplication/deduplication/online/OnlineDeduplication.java +++ b/core/src/main/java/com/bakdata/deduplication/deduplication/online/OnlineDeduplication.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.deduplication.online; diff --git a/core/src/main/java/com/bakdata/deduplication/deduplication/online/OnlinePairBasedDeduplication.java b/core/src/main/java/com/bakdata/deduplication/deduplication/online/OnlinePairBasedDeduplication.java index e9d6549..75e7279 100644 --- a/core/src/main/java/com/bakdata/deduplication/deduplication/online/OnlinePairBasedDeduplication.java +++ b/core/src/main/java/com/bakdata/deduplication/deduplication/online/OnlinePairBasedDeduplication.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.deduplication.online; @@ -51,14 +50,14 @@ public T deduplicate(final T newRecord) { } final List> mainClusters = - clusters.stream().filter(c -> c.contains(newRecord)).collect(Collectors.toList()); + clusters.stream().filter(c -> c.contains(newRecord)).collect(Collectors.toList()); if (mainClusters.size() != 1) { throw new IllegalStateException( - "Expected exactly one cluster with the new record, but received " + clusters); + "Expected exactly one cluster with the new record, but received " + clusters); } return Optional.of(this.fusion.fuse(mainClusters.get(0))) - .flatMap(this.hardFusionHandler::handlePartiallyFusedValue) + .flatMap(this.hardFusionHandler::handlePartiallyFusedValue) .map(FusedValue::getValue) .orElse(newRecord); } diff --git a/core/src/main/java/com/bakdata/deduplication/duplicate_detection/HardPairHandler.java b/core/src/main/java/com/bakdata/deduplication/duplicate_detection/HardPairHandler.java index bc94181..f7157cd 
100644 --- a/core/src/main/java/com/bakdata/deduplication/duplicate_detection/HardPairHandler.java +++ b/core/src/main/java/com/bakdata/deduplication/duplicate_detection/HardPairHandler.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,12 +20,10 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.duplicate_detection; import com.bakdata.deduplication.classifier.ClassifiedCandidate; - import java.util.Optional; import java.util.function.Function; diff --git a/core/src/main/java/com/bakdata/deduplication/duplicate_detection/online/OnlineDuplicateDetection.java b/core/src/main/java/com/bakdata/deduplication/duplicate_detection/online/OnlineDuplicateDetection.java index 8e65039..d79900a 100644 --- a/core/src/main/java/com/bakdata/deduplication/duplicate_detection/online/OnlineDuplicateDetection.java +++ b/core/src/main/java/com/bakdata/deduplication/duplicate_detection/online/OnlineDuplicateDetection.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,12 +20,10 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.duplicate_detection.online; import com.bakdata.deduplication.clustering.Cluster; - import java.util.List; @FunctionalInterface diff --git a/core/src/main/java/com/bakdata/deduplication/duplicate_detection/online/OnlinePairBasedDuplicateDetection.java b/core/src/main/java/com/bakdata/deduplication/duplicate_detection/online/OnlinePairBasedDuplicateDetection.java index c9a4a33..4be5029 100644 --- a/core/src/main/java/com/bakdata/deduplication/duplicate_detection/online/OnlinePairBasedDuplicateDetection.java +++ b/core/src/main/java/com/bakdata/deduplication/duplicate_detection/online/OnlinePairBasedDuplicateDetection.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.duplicate_detection.online; @@ -58,8 +57,9 @@ public List> detectDuplicates(final T newRecord) { .collect(Collectors.toList()); final var handledPairs = classified.stream() - .flatMap(cc -> cc.getClassification().getResult() == Classification.ClassificationResult.POSSIBLE_DUPLICATE ? - this.hardPairHandler.apply(cc).stream() : + .flatMap(cc -> cc.getClassification().getResult() + == Classification.ClassificationResult.POSSIBLE_DUPLICATE ? 
+ this.hardPairHandler.apply(cc).stream() : Stream.of(cc)) .collect(Collectors.toList()); diff --git a/core/src/main/java/com/bakdata/deduplication/fusion/FusedValue.java b/core/src/main/java/com/bakdata/deduplication/fusion/FusedValue.java index 488b743..14f7486 100644 --- a/core/src/main/java/com/bakdata/deduplication/fusion/FusedValue.java +++ b/core/src/main/java/com/bakdata/deduplication/fusion/FusedValue.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,14 +20,12 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.fusion; import com.bakdata.deduplication.clustering.Cluster; -import lombok.Value; - import java.util.List; +import lombok.Value; @Value public class FusedValue { diff --git a/core/src/main/java/com/bakdata/deduplication/fusion/Fusion.java b/core/src/main/java/com/bakdata/deduplication/fusion/Fusion.java index 21b120b..8408a67 100644 --- a/core/src/main/java/com/bakdata/deduplication/fusion/Fusion.java +++ b/core/src/main/java/com/bakdata/deduplication/fusion/Fusion.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.fusion; diff --git a/core/src/main/java/com/bakdata/deduplication/fusion/FusionException.java b/core/src/main/java/com/bakdata/deduplication/fusion/FusionException.java index 05209c6..c518ffd 100644 --- a/core/src/main/java/com/bakdata/deduplication/fusion/FusionException.java +++ b/core/src/main/java/com/bakdata/deduplication/fusion/FusionException.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.fusion; @@ -41,7 +40,7 @@ public FusionException(final Throwable cause) { } public FusionException(final String message, final Throwable cause, final boolean enableSuppression, - final boolean writableStackTrace) { + final boolean writableStackTrace) { super(message, cause, enableSuppression, writableStackTrace); } } diff --git a/core/src/main/java/com/bakdata/deduplication/similarity/CutoffSimiliarityMeasure.java b/core/src/main/java/com/bakdata/deduplication/similarity/CutoffSimiliarityMeasure.java new file mode 100644 index 0000000..5a32769 --- /dev/null +++ b/core/src/main/java/com/bakdata/deduplication/similarity/CutoffSimiliarityMeasure.java @@ -0,0 +1,50 @@ +/* + * MIT License + * + * Copyright (c) 2019 bakdata GmbH + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the 
Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package com.bakdata.deduplication.similarity; + +import lombok.Value; + +@Value +public class CutoffSimiliarityMeasure implements SimilarityMeasure { + SimilarityMeasure inner; + float threshold; + + protected static float cutoff(final float similarity, final float min) { + return similarity < min ? 
0 : similarity; + } + + @Override + public float getSimilarity(final T left, final T right, final SimilarityContext context) { + return cutoff(this.inner.getSimilarity(left, right, context), this.threshold); + } + + @Override + public SimilarityMeasure cutoff(final float threshold) { + if (threshold < this.threshold) { + return this; + } + return new com.bakdata.deduplication.similarity.CutoffSimiliarityMeasure<>(this.inner, threshold); + } +} diff --git a/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityContext.java b/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityContext.java index bec8071..a237a5b 100644 --- a/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityContext.java +++ b/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityContext.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.similarity; diff --git a/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityException.java b/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityException.java index e662caf..62628dc 100644 --- a/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityException.java +++ b/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityException.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.similarity; @@ -41,7 +40,7 @@ public SimilarityException(final Throwable cause) { } public SimilarityException(final String message, final Throwable cause, final boolean enableSuppression, - final boolean writableStackTrace) { + final boolean writableStackTrace) { super(message, cause, enableSuppression, writableStackTrace); } } diff --git a/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityMeasure.java b/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityMeasure.java index 24f5b6c..2510f6f 100644 --- a/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityMeasure.java +++ b/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityMeasure.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,13 +20,11 @@ * LIABILITY, WHETHER IN AN 
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.similarity; import java.util.function.Function; import java.util.function.Predicate; -import lombok.Value; @FunctionalInterface public interface SimilarityMeasure { @@ -40,7 +38,7 @@ static boolean isUnknown(final float value) { } static float scaleWithThreshold(final float similarity, final float min) { - if(similarity >= min) { + if (similarity >= min) { return (similarity - min) / (1 - min); } return -(min - similarity) / min; @@ -82,26 +80,4 @@ default SimilarityMeasure unknownIf(final Predicate scorePredicate) { }; } - @Value - class CutoffSimiliarityMeasure implements SimilarityMeasure { - SimilarityMeasure inner; - float threshold; - - protected static float cutoff(final float similarity, final float min) { - return similarity < min ? 0 : similarity; - } - - @Override - public float getSimilarity(final T left, final T right, final SimilarityContext context) { - return cutoff(this.inner.getSimilarity(left, right, context), this.threshold); - } - - @Override - public SimilarityMeasure cutoff(final float threshold) { - if (threshold < this.threshold) { - return this; - } - return new CutoffSimiliarityMeasure<>(this.inner, threshold); - } - } } diff --git a/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityPath.java b/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityPath.java index 2c9deb2..59d0b9d 100644 --- a/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityPath.java +++ b/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityPath.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to 
deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.similarity; diff --git a/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityTransformation.java b/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityTransformation.java index 26898c4..196ee42 100644 --- a/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityTransformation.java +++ b/core/src/main/java/com/bakdata/deduplication/similarity/SimilarityTransformation.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.similarity; @@ -35,4 +34,4 @@ default SimilarityTransformation andThen(final SimilarityTransformatio final var thisTransformation = this; return (t, context) -> after.transform(thisTransformation.transform(t, context), context); } -} \ No newline at end of file +} diff --git a/core/src/main/java/com/bakdata/util/FunctionalClass.java b/core/src/main/java/com/bakdata/util/FunctionalClass.java index f5cefd6..e5bd936 100644 --- a/core/src/main/java/com/bakdata/util/FunctionalClass.java +++ b/core/src/main/java/com/bakdata/util/FunctionalClass.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,15 +20,9 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.util; -import lombok.AccessLevel; -import lombok.NonNull; -import lombok.RequiredArgsConstructor; -import lombok.Value; - import java.beans.IntrospectionException; import java.beans.PropertyDescriptor; import java.lang.reflect.Constructor; @@ -36,6 +30,10 @@ import java.util.function.BiConsumer; import java.util.function.Function; import java.util.function.Supplier; +import lombok.AccessLevel; +import lombok.NonNull; +import lombok.RequiredArgsConstructor; +import lombok.Value; @Value @RequiredArgsConstructor(access = AccessLevel.PRIVATE) diff --git a/core/src/main/java/com/bakdata/util/FunctionalConstructor.java b/core/src/main/java/com/bakdata/util/FunctionalConstructor.java index 8a8daf3..98c8fef 100644 --- a/core/src/main/java/com/bakdata/util/FunctionalConstructor.java +++ b/core/src/main/java/com/bakdata/util/FunctionalConstructor.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,17 +20,15 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.util; +import java.lang.reflect.Constructor; +import java.lang.reflect.InvocationTargetException; import lombok.NonNull; import lombok.SneakyThrows; import lombok.Value; -import java.lang.reflect.Constructor; -import java.lang.reflect.InvocationTargetException; - @Value public class FunctionalConstructor { diff --git a/core/src/main/java/com/bakdata/util/FunctionalMethod.java b/core/src/main/java/com/bakdata/util/FunctionalMethod.java index 8b70732..aa1f090 100644 --- a/core/src/main/java/com/bakdata/util/FunctionalMethod.java +++ b/core/src/main/java/com/bakdata/util/FunctionalMethod.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,17 +20,15 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.util; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; import lombok.NonNull; import lombok.SneakyThrows; import lombok.Value; -import java.lang.reflect.InvocationTargetException; -import java.lang.reflect.Method; - @Value public class FunctionalMethod { diff --git a/core/src/test/java/com/bakdata/util/FunctionalClassTest.java b/core/src/test/java/com/bakdata/util/FunctionalClassTest.java index fe03b73..7616dad 100644 --- a/core/src/test/java/com/bakdata/util/FunctionalClassTest.java +++ b/core/src/test/java/com/bakdata/util/FunctionalClassTest.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,24 +20,22 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.util; -import com.bakdata.util.FunctionalClass.Field; -import lombok.Data; -import lombok.Getter; -import lombok.Setter; -import org.junit.jupiter.api.Test; +import static com.bakdata.util.FunctionalClass.from; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; +import com.bakdata.util.FunctionalClass.Field; import java.beans.IntrospectionException; import java.util.function.BiConsumer; import java.util.function.Function; import java.util.function.Supplier; - -import static com.bakdata.util.FunctionalClass.from; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatExceptionOfType; +import lombok.Data; +import lombok.Getter; +import lombok.Setter; +import org.junit.jupiter.api.Test; class FunctionalClassTest { @@ -126,7 +124,7 @@ void testGetMissingSetter() { class PersonWithoutSetter { @Getter - private String id = null; + private final String id = null; } assertThatExceptionOfType(RuntimeException.class) @@ -196,4 +194,4 @@ private static class Person { private String id; } -} \ No newline at end of file +} diff --git a/examples/build.gradle.kts b/examples/build.gradle.kts index 953c6a3..3bb9c3a 100644 --- a/examples/build.gradle.kts +++ b/examples/build.gradle.kts @@ -4,4 +4,4 @@ dependencies { implementation(project(":common")) testImplementation(group = "org.apache.commons", name = "commons-csv", version = "1.6") -} \ No newline at end of file +} diff --git a/examples/src/main/java/com/bakdata/deduplication/person/Gender.java b/examples/src/main/java/com/bakdata/deduplication/person/Gender.java index 41f51dd..3c2ddf6 100644 --- a/examples/src/main/java/com/bakdata/deduplication/person/Gender.java +++ b/examples/src/main/java/com/bakdata/deduplication/person/Gender.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata 
GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.person; diff --git a/examples/src/main/java/com/bakdata/deduplication/person/Person.java b/examples/src/main/java/com/bakdata/deduplication/person/Person.java index 2c1065c..b8d9afb 100644 --- a/examples/src/main/java/com/bakdata/deduplication/person/Person.java +++ b/examples/src/main/java/com/bakdata/deduplication/person/Person.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,19 +20,17 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.person; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - import java.time.LocalDate; import java.time.LocalDateTime; import java.util.HashSet; import java.util.Set; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; @Data @NoArgsConstructor diff --git a/examples/src/main/java/com/bakdata/deduplication/person/PersonCandidateSelection.java b/examples/src/main/java/com/bakdata/deduplication/person/PersonCandidateSelection.java index caa00af..49790b6 100644 --- a/examples/src/main/java/com/bakdata/deduplication/person/PersonCandidateSelection.java +++ b/examples/src/main/java/com/bakdata/deduplication/person/PersonCandidateSelection.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.person; @@ -59,7 +58,7 @@ private static String normalize(final String value) { // remove everything in braces // remove all non-alphanumericals final String nonBraced = - BRACED_TERMS.matcher(Normalizer.normalize(value.toLowerCase(), Form.NFD)).replaceAll(""); + BRACED_TERMS.matcher(Normalizer.normalize(value.toLowerCase(), Form.NFD)).replaceAll(""); return NON_ALPHA.matcher(nonBraced).replaceAll(""); } } diff --git a/examples/src/main/java/com/bakdata/deduplication/person/PersonClassifier.java b/examples/src/main/java/com/bakdata/deduplication/person/PersonClassifier.java index 885d8c0..b331ac5 100644 --- a/examples/src/main/java/com/bakdata/deduplication/person/PersonClassifier.java +++ b/examples/src/main/java/com/bakdata/deduplication/person/PersonClassifier.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,20 +20,23 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.person; +import static com.bakdata.deduplication.similarity.CommonSimilarityMeasures.colognePhonetic; +import static com.bakdata.deduplication.similarity.CommonSimilarityMeasures.equality; +import static com.bakdata.deduplication.similarity.CommonSimilarityMeasures.jaroWinkler; +import static com.bakdata.deduplication.similarity.CommonSimilarityMeasures.levenshtein; +import static com.bakdata.deduplication.similarity.CommonSimilarityMeasures.max; +import static com.bakdata.deduplication.similarity.CommonSimilarityMeasures.maxDiff; + import com.bakdata.deduplication.classifier.Classifier; import com.bakdata.deduplication.classifier.RuleBasedClassifier; import com.bakdata.deduplication.similarity.CommonSimilarityMeasures; -import lombok.Value; -import lombok.experimental.Delegate; - import java.time.format.DateTimeFormatter; import java.time.temporal.ChronoUnit; - -import static com.bakdata.deduplication.similarity.CommonSimilarityMeasures.*; +import lombok.Value; +import lombok.experimental.Delegate; @Value public class PersonClassifier implements Classifier { @@ -43,9 +46,11 @@ public class PersonClassifier implements Classifier { Classifier classifier = RuleBasedClassifier.builder() .positiveRule("Basic comparison", CommonSimilarityMeasures.weightedAverage() .add(2, Person::getFirstName, max(levenshtein().cutoff(0.5f), jaroWinkler())) - .add(2, Person::getLastName, max(equality().of(colognePhonetic()), levenshtein().cutoff(0.5f), jaroWinkler())) + .add(2, Person::getLastName, + max(equality().of(colognePhonetic()), levenshtein().cutoff(0.5f), jaroWinkler())) .add(1, Person::getGender, equality()) - .add(2, Person::getBirthDate, max(levenshtein().of(ISO_FORMAT::format), maxDiff(2, ChronoUnit.DAYS))) + .add(2, Person::getBirthDate, + max(levenshtein().of(ISO_FORMAT::format), maxDiff(2, ChronoUnit.DAYS))) .build() .scaleWithThreshold(0.9f)) .build(); diff --git 
a/examples/src/main/java/com/bakdata/deduplication/person/PersonClustering.java b/examples/src/main/java/com/bakdata/deduplication/person/PersonClustering.java index 15cb898..9bd528c 100644 --- a/examples/src/main/java/com/bakdata/deduplication/person/PersonClustering.java +++ b/examples/src/main/java/com/bakdata/deduplication/person/PersonClustering.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,11 +20,14 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.person; -import com.bakdata.deduplication.clustering.*; +import com.bakdata.deduplication.clustering.Cluster; +import com.bakdata.deduplication.clustering.Clustering; +import com.bakdata.deduplication.clustering.ConsistentClustering; +import com.bakdata.deduplication.clustering.RefineCluster; +import com.bakdata.deduplication.clustering.RefinedTransitiveClosure; import lombok.Value; import lombok.experimental.Delegate; diff --git a/examples/src/main/java/com/bakdata/deduplication/person/PersonDeduplication.java b/examples/src/main/java/com/bakdata/deduplication/person/PersonDeduplication.java index e4e473f..ed70924 100644 --- a/examples/src/main/java/com/bakdata/deduplication/person/PersonDeduplication.java +++ b/examples/src/main/java/com/bakdata/deduplication/person/PersonDeduplication.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * 
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.person; @@ -37,9 +36,9 @@ public class PersonDeduplication implements OnlineDeduplication { OnlineDeduplication deduplication; public PersonDeduplication(final HardPairHandler hardPairHandler, - final HardFusionHandler hardFusionHandler) { + final HardFusionHandler hardFusionHandler) { this.deduplication = OnlinePairBasedDeduplication.builder() - .duplicateDetection(new PersonDuplicateDetection(hardPairHandler)) + .duplicateDetection(new PersonDuplicateDetection(hardPairHandler)) .fusion(new PersonFusion()) .hardFusionHandler(hardFusionHandler) .build(); diff --git a/examples/src/main/java/com/bakdata/deduplication/person/PersonDuplicateDetection.java b/examples/src/main/java/com/bakdata/deduplication/person/PersonDuplicateDetection.java index ee8fcd4..f4e111d 100644 --- a/examples/src/main/java/com/bakdata/deduplication/person/PersonDuplicateDetection.java +++ b/examples/src/main/java/com/bakdata/deduplication/person/PersonDuplicateDetection.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.person; @@ -36,11 +35,11 @@ public class PersonDuplicateDetection implements OnlineDuplicateDetection duplicateDetection; public PersonDuplicateDetection(final HardPairHandler hardPairHandler) { - duplicateDetection = OnlinePairBasedDuplicateDetection.builder() - .classifier(new PersonClassifier()) - .candidateSelection(new PersonCandidateSelection()) - .clustering(new PersonClustering()) - .hardPairHandler(hardPairHandler) - .build(); + this.duplicateDetection = OnlinePairBasedDuplicateDetection.builder() + .classifier(new PersonClassifier()) + .candidateSelection(new PersonCandidateSelection()) + .clustering(new PersonClustering()) + .hardPairHandler(hardPairHandler) + .build(); } } diff --git a/examples/src/main/java/com/bakdata/deduplication/person/PersonFusion.java b/examples/src/main/java/com/bakdata/deduplication/person/PersonFusion.java index 6573c9e..7e6d56e 100644 --- a/examples/src/main/java/com/bakdata/deduplication/person/PersonFusion.java +++ b/examples/src/main/java/com/bakdata/deduplication/person/PersonFusion.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,21 +20,25 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * */ package com.bakdata.deduplication.person; +import static com.bakdata.deduplication.fusion.CommonConflictResolutions.assumeEqualValue; +import static com.bakdata.deduplication.fusion.CommonConflictResolutions.latest; +import static com.bakdata.deduplication.fusion.CommonConflictResolutions.longest; +import static com.bakdata.deduplication.fusion.CommonConflictResolutions.max; +import static com.bakdata.deduplication.fusion.CommonConflictResolutions.min; +import static com.bakdata.deduplication.fusion.CommonConflictResolutions.union; +import static com.bakdata.deduplication.fusion.CommonConflictResolutions.vote; + import com.bakdata.deduplication.fusion.ConflictResolution; import com.bakdata.deduplication.fusion.ConflictResolutionFusion; import com.bakdata.deduplication.fusion.ConflictResolutions; import com.bakdata.deduplication.fusion.Fusion; +import java.util.Set; import lombok.Value; import lombok.experimental.Delegate; -import java.util.Set; - -import static com.bakdata.deduplication.fusion.CommonConflictResolutions.*; - @Value public class PersonFusion implements Fusion { ConflictResolution personMerge = ConflictResolutions.merge(Person::new) diff --git a/examples/src/test/java/com/bakdata/deduplication/person/PersonDeduplicationTest.java b/examples/src/test/java/com/bakdata/deduplication/person/PersonDeduplicationTest.java index 7d96d6e..83f7d14 100644 --- a/examples/src/test/java/com/bakdata/deduplication/person/PersonDeduplicationTest.java +++ b/examples/src/test/java/com/bakdata/deduplication/person/PersonDeduplicationTest.java @@ -1,7 +1,7 @@ /* - * The MIT License + * MIT License * - * Copyright (c) 2018 bakdata GmbH + * Copyright (c) 2019 bakdata GmbH * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,7 +20,6 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE 
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * */ package com.bakdata.deduplication.person; @@ -48,7 +47,8 @@ class PersonDeduplicationTest { private static List parseCsv(final String resourceName) throws IOException { final CSVFormat format = CSVFormat.newFormat('\t').withFirstRecordAsHeader().withQuote('"'); try (final var parser = CSVParser - .parse(PersonDeduplicationTest.class.getResourceAsStream(resourceName), StandardCharsets.UTF_8, format)) { + .parse(PersonDeduplicationTest.class.getResourceAsStream(resourceName), StandardCharsets.UTF_8, + format)) { return parser.getRecords() .stream() .map(record -> Person.builder() @@ -68,16 +68,16 @@ void testDeduplication() throws IOException { final PersonDeduplication deduplication = new PersonDeduplication(HardPairHandler.ignore(), Optional::of); // no fusion on the non-duplicated customers - for (final Person customer : PersonDeduplicationTest.parseCsv("/customer.csv")) { + for (final Person customer : parseCsv("/customer.csv")) { final Person fusedPerson = deduplication.deduplicate(customer); assertSame(customer, fusedPerson); } - for (final Person customer : PersonDeduplicationTest.parseCsv("/exact_duplicates.csv")) { + for (final Person customer : parseCsv("/exact_duplicates.csv")) { final Person fusedPerson = deduplication.deduplicate(customer); assertNotSame(customer, fusedPerson); // should be the same except for fusion id assertEquals(customer, fusedPerson.toBuilder().fusedIds(Set.of()).build()); } } -} \ No newline at end of file +} diff --git a/settings.gradle b/settings.gradle index a1a8ea3..9a976d3 100644 --- a/settings.gradle +++ b/settings.gradle @@ -1,3 +1,3 @@ rootProject.name = 'dedupe' -include 'core', 'common', 'examples' \ No newline at end of file +include 'core', 'common', 'examples'