Skip to content

Commit

Permalink
[04b6IYYB] Fix sampling in apoc.meta procedures (#3440)
Browse files Browse the repository at this point in the history
  • Loading branch information
gem-neo4j authored Feb 2, 2023
1 parent 5a1ff56 commit 27b56d6
Show file tree
Hide file tree
Showing 4 changed files with 243 additions and 72 deletions.
96 changes: 57 additions & 39 deletions core/src/main/java/apoc/meta/Meta.java
Original file line number Diff line number Diff line change
Expand Up @@ -416,10 +416,10 @@ public long count(@Name(value = "nodes", defaultValue = "[]") List<String> nodes
? StreamSupport.stream(subGraph.getAllLabelsInUse().spliterator(), false)
: nodes.stream().filter(Objects::nonNull).map(String::trim).map(Label::label);

final boolean isIncludeRels = CollectionUtils.isEmpty(conf.getIncludesRels());
final boolean isIncludeRels = CollectionUtils.isEmpty(conf.getIncludeRels());
Set<Long> visitedNodes = new HashSet<>();
return labels
.flatMap(label -> isIncludeRels ? Stream.of(subGraph.countsForNode(label)) : conf.getIncludesRels()
.flatMap(label -> isIncludeRels ? Stream.of(subGraph.countsForNode(label)) : conf.getIncludeRels()
.stream()
.filter(Objects::nonNull)
.map(String::trim)
Expand Down Expand Up @@ -544,23 +544,23 @@ public Stream<MetaResult> dataOf(@Name(value = "graph") Object graph, @Name(valu
} else {
throw new IllegalArgumentException("Supported inputs are String, VirtualGraph, Map");
}
return collectMetaData(subGraph, metaConfig).values().stream().flatMap(x -> x.values().stream());
return collectMetaData(subGraph, metaConfig.getSampleMetaConfig()).values().stream().flatMap(x -> x.values().stream());
}

// todo ask index for distinct values if index size < 10 or so
// todo put index sizes for indexed properties
@Procedure
@Description("apoc.meta.data({config}) - examines a subset of the graph to provide a tabular meta information")
public Stream<MetaResult> data(@Name(value = "config",defaultValue = "{}") Map<String,Object> config) {
MetaConfig metaConfig = new MetaConfig(config);
SampleMetaConfig metaConfig = new SampleMetaConfig(config);
return collectMetaData(new DatabaseSubGraph(transaction), metaConfig).values().stream().flatMap(x -> x.values().stream());
}

@Procedure
@Description("apoc.meta.schema({config}) - examines a subset of the graph to provide a map-like meta information")
public Stream<MapResult> schema(@Name(value = "config",defaultValue = "{}") Map<String,Object> config) {
MetaStats metaStats = collectStats();
MetaConfig metaConfig = new MetaConfig(config);
SampleMetaConfig metaConfig = new SampleMetaConfig(config);
Map<Set<String>, Map<String, MetaItem>> metaData = collectMetaData(new DatabaseSubGraph(transaction), metaConfig);

Map<String, Object> relationships = collectRelationshipsMetaData(metaStats, metaData);
Expand Down Expand Up @@ -646,12 +646,12 @@ private Tables4LabelsProfile collectTables4LabelsProfile (MetaConfig config) {
}
}

Map<String, Long> countStore = getLabelCountStore();
Map<String, Long> countStore = getLabelCountStore(transaction, kernelTx);

Set<String> includeLabels = config.getIncludesLabels();
Set<String> excludes = config.getExcludes();
Set<String> includeLabels = config.getIncludeLabels();
Set<String> excludes = config.getExcludeLabels();

Set<String> includeRels = config.getIncludesRels();
Set<String> includeRels = config.getIncludeRels();
Set<String> excludeRels = config.getExcludeRels();

for (Label label : tx.getAllLabelsInUse()) {
Expand Down Expand Up @@ -695,7 +695,7 @@ private Tables4LabelsProfile collectTables4LabelsProfile (MetaConfig config) {

// End new code

private Map<Set<String>, Map<String, MetaItem>> collectMetaData(SubGraph graph, MetaConfig config) {
private Map<Set<String>, Map<String, MetaItem>> collectMetaData(SubGraph graph, SampleMetaConfig config) {
Map<Set<String>, Map<String, MetaItem>> metaData = new LinkedHashMap<>(100);

Set<RelationshipType> types = Iterables.asSet(graph.getAllRelationshipTypesInUse());
Expand Down Expand Up @@ -735,15 +735,15 @@ private Set<String> getIndexedProperties(Iterable<IndexDefinition> indexes) {
.collect(Collectors.toSet());
}

private Map<String, Long> getLabelCountStore() {
private static Map<String, Long> getLabelCountStore(Transaction tx, KernelTransaction kernelTx) {
List<String> labels = Iterables.stream(tx.getAllLabelsInUse()).map( Label::name ).collect( Collectors.toList());
TokenRead tokenRead = kernelTx.tokenRead();
return labels
.stream()
.collect(Collectors.toMap(e -> e, e -> kernelTx.dataRead().countsForNodeWithoutTxState(tokenRead.nodeLabel(e))));
}

public long getSampleForLabelCount(long labelCount, long sample) {
public static long getSampleForLabelCount(long labelCount, long sample) {
if(sample != -1L) {
long skipCount = labelCount / sample;
long min = (long) Math.floor(skipCount - (skipCount * 0.1D));
Expand Down Expand Up @@ -1041,14 +1041,14 @@ public RelationshipType relationshipType() {
@Procedure
@Description("apoc.meta.graph - examines the full graph to create the meta-graph")
public Stream<GraphResult> graph(@Name(value = "config",defaultValue = "{}") Map<String,Object> config) {
MetaConfig metaConfig = new MetaConfig(config);
SampleMetaConfig metaConfig = new SampleMetaConfig(config, false);
return metaGraph(new DatabaseSubGraph(transaction), null, null, true, metaConfig);
}

@Procedure("apoc.meta.graph.of")
@Description("apoc.meta.graph.of({graph}, {config}) - examines a subset of the graph to provide a graph meta information")
public Stream<GraphResult> graphOf(@Name(value = "graph",defaultValue = "{}") Object graph, @Name(value = "config",defaultValue = "{}") Map<String,Object> config) {
MetaConfig metaConfig = new MetaConfig(config);
MetaConfig metaConfig = new MetaConfig(config, false);
final SubGraph subGraph;
if (graph instanceof String) {
Result result = tx.execute((String) graph);
Expand All @@ -1066,12 +1066,10 @@ public Stream<GraphResult> graphOf(@Name(value = "graph",defaultValue = "{}") Ob
} else {
throw new IllegalArgumentException("Supported inputs are String, VirtualGraph, Map");
}
return metaGraph(subGraph,null, null, true, metaConfig);
return metaGraph(subGraph,null, null, true, metaConfig.getSampleMetaConfig());
}

private Stream<GraphResult> metaGraph(SubGraph subGraph, Collection<String> labelNames, Collection<String> relTypeNames, boolean removeMissing, MetaConfig metaConfig) {
TokenRead tokenRead = kernelTx.tokenRead();

private Stream<GraphResult> metaGraph(SubGraph subGraph, Collection<String> labelNames, Collection<String> relTypeNames, boolean removeMissing, SampleMetaConfig metaConfig) {
Iterable<RelationshipType> types = subGraph.relTypesInUse(relTypeNames);
Iterable<Label> labels = CollectionUtils.isNotEmpty(labelNames)
? labelNames.stream().map(Label::label).collect(Collectors.toList()) : subGraph.getAllLabelsInUse();
Expand Down Expand Up @@ -1114,7 +1112,7 @@ private Stream<GraphResult> metaGraph(SubGraph subGraph, Collection<String> labe
return Stream.of(graphResult);
}

private void filterNonExistingRelationships(Map<Pattern, Relationship> vRels, MetaConfig metaConfig) {
private void filterNonExistingRelationships(Map<Pattern, Relationship> vRels, SampleMetaConfig metaConfig) {
Set<Pattern> rels = vRels.keySet();
Map<Pair<String,String>,Set<Pattern>> aggregated = new HashMap<>();
for (Pattern rel : rels) {
Expand All @@ -1124,43 +1122,57 @@ private void filterNonExistingRelationships(Map<Pattern, Relationship> vRels, Me
aggregated.values().stream()
.filter( c -> c.size() > 1)
.flatMap(Collection::stream)
.filter( p -> !relationshipExists(p, vRels.get(p), metaConfig))
.filter( p -> !relationshipExistsWithDegreeCheck(p, vRels.get(p), metaConfig))
.forEach(vRels::remove);
}

private boolean relationshipExists(Pattern p, Relationship relationship, MetaConfig metaConfig) {
if (relationship==null) return false;
private boolean relationshipExistsWithDegreeCheck(Pattern p, Relationship relationship, SampleMetaConfig metaConfig) {
if (relationship == null) return false;
double degreeFrom = (double)(long)relationship.getProperty("out") / (long)relationship.getStartNode().getProperty("count");
double degreeTo = (double)(long)relationship.getProperty("in") / (long)relationship.getEndNode().getProperty("count");

if (degreeFrom < degreeTo) {
if (relationshipExists(p.labelFrom(), p.labelTo(), p.relationshipType(), Direction.OUTGOING, metaConfig)) return true;
return (relationshipExists(tx, p.labelFrom(), p.labelTo(), p.relationshipType(), Direction.OUTGOING, metaConfig));
} else {
if (relationshipExists(p.labelTo(), p.labelFrom(), p.relationshipType(), Direction.INCOMING, metaConfig)) return true;
return (relationshipExists(tx, p.labelTo(), p.labelFrom(), p.relationshipType(), Direction.INCOMING, metaConfig));
}
return false;
}

private boolean relationshipExists(Label labelFromLabel, Label labelToLabel, RelationshipType relationshipType, Direction direction, MetaConfig metaConfig) {
Map<String, Long> countStore = getLabelCountStore();
/**
* relationshipExists uses sampling to check if the relationships added in previous steps exist.
* The sample count is the skip count; e.g. if set to 1000 this means every 1000th node will be checked.
* A high sample count means that only one node will be checked each time.
* Note: each node is still fetched, but the relationships on that node will not be checked
* if skipped, which should make it faster.
*/
static boolean relationshipExists(
Transaction tx,
Label labelFromLabel,
Label labelToLabel,
RelationshipType relationshipType,
Direction direction,
SampleMetaConfig metaConfig
) {
try (ResourceIterator<Node> nodes = tx.findNodes(labelFromLabel)) {
long count = 1L;
String labelName = labelFromLabel.name();
long labelCount = countStore.get(labelName);
long sample = getSampleForLabelCount(labelCount, metaConfig.getSample());
long count = 0L;
// A sample size below or equal to 0 means we should check every node.
long skipCount = metaConfig.getSample() > 0 ? metaConfig.getSample() : 1;
while (nodes.hasNext()) {
count++;
Node node = nodes.next();
if(count % sample == 0) {
if (count % skipCount == 0) {
long maxRels = metaConfig.getMaxRels();
for (Relationship rel : node.getRelationships(direction, relationshipType)) {
Node otherNode = direction == Direction.OUTGOING ? rel.getEndNode() : rel.getStartNode();
// We have found the rel, we are confident the relationship exists.
if (otherNode.hasLabel(labelToLabel)) return true;
if (maxRels != -1 && maxRels-- == 0) break;
}
}
count++;
}
}
// Our sampling (or full scan if skipCount == 1) did not find the relationship,
// so we assume it doesn't exist and remove it from the schema. This may result in false negatives!
return false;
}

Expand All @@ -1173,17 +1185,23 @@ private void combine(Map<Pair<String, String>, Set<Pattern>> aggregated, Pair<St
@Procedure
@Description("apoc.meta.graphSample() - examines the database statistics to build the meta graph, very fast, might report extra relationships")
public Stream<GraphResult> graphSample(@Name(value = "config",defaultValue = "{}") Map<String,Object> config) {
MetaConfig metaConfig = new MetaConfig(config);
return metaGraph(new DatabaseSubGraph(transaction), null, null, false, metaConfig);
return metaGraph(new DatabaseSubGraph(transaction), null, null, false, new SampleMetaConfig(null));
}

@Procedure
@Description("apoc.meta.subGraph({labels:[labels],rels:[rel-types], excludes:[labels,rel-types]}) - examines a sample sub graph to create the meta-graph")
public Stream<GraphResult> subGraph(@Name("config") Map<String,Object> config ) {

MetaConfig metaConfig = new MetaConfig(config);

return filterResultStream(metaConfig.getExcludes(), metaGraph(new DatabaseSubGraph(transaction), metaConfig.getIncludesLabels(), metaConfig.getIncludesRels(),true, metaConfig));
MetaConfig metaConfig = new MetaConfig(config, false);
return filterResultStream(
metaConfig.getExcludeLabels(),
metaGraph(
new DatabaseSubGraph(transaction),
metaConfig.getIncludeLabels(),
metaConfig.getIncludeRels(),
true,
metaConfig.getSampleMetaConfig()
)
);
}

private Stream<GraphResult> filterResultStream(Set<String> excludes, Stream<GraphResult> graphResultStream) {
Expand Down
75 changes: 42 additions & 33 deletions core/src/main/java/apoc/meta/MetaConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,36 @@

public class MetaConfig {

private final Set<String> includesLabels;
private final Set<String> includesRels;
private final Set<String> excludes;
private final Set<String> includeLabels;
private final Set<String> includeRels;
private final Set<String> excludeLabels;
private final Set<String> excludeRels;
private final long maxRels;
private final long sample;
private final boolean addRelationshipsBetweenNodes;

private final SampleMetaConfig sampleMetaConfig;

/**
* A map of values, with the following keys and meanings.
* - labels: a list of strings, which are whitelisted node labels. If this list
* - includeLabels: a list of strings, which are allowlisted node labels. If this list
* is specified **only these labels** will be examined.
* - rels: a list of strings, which are whitelisted rel types. If this list is
* - includeRels: a list of strings, which are allowlisted rel types. If this list is
* specified, **only these reltypes** will be examined.
* - excludes: a list of strings, which are node labels. This
* works like a blacklist: if listed here, the thing won't be considered. Everything
* else (subject to the whitelist) will be.
* - excludeLabels: a list of strings, which are node labels. This
* works like a denylist: if listed here, the thing won't be considered. Everything
* else (subject to the allowlist) will be.
* - excludeRels: a list of strings, which are relationship types. This
* works like a denylist: if listed here, the thing won't be considered. Everything
* else (subject to the allowlist) will be.
* - sample: a long number, i.e. "1 in (SAMPLE)". If set to 1000 this means that
* every 1000th node will be examined. It does **not** mean that a total of 1000 nodes
* will be sampled.
* - maxRels: the maximum number of relationships of a given type to look at.
* @param config
* - maxRels: the maximum number of relationships to look at per Node Label.
*/

public MetaConfig(Map<String,Object> config) {
public MetaConfig(Map<String,Object> config, Boolean shouldSampleByDefault) {
config = config != null ? config : Collections.emptyMap();

// To maintain backwards compatibility, need to still support "labels", "rels" and "excludes" for "includeLabels", "includeRels" and "excludeLabels" respectively.
// TODO: Remove in 6.0. To maintain backwards compatibility until then, we still need to support
// "labels", "rels" and "excludes" for "includeLabels", "includeRels" and "excludeLabels" respectively.

Set<String> includesLabelsLocal = new HashSet<>((Collection<String>)config.getOrDefault("labels",Collections.EMPTY_SET));
Set<String> includesRelsLocal = new HashSet<>((Collection<String>)config.getOrDefault("rels",Collections.EMPTY_SET));
Expand All @@ -53,48 +55,55 @@ public MetaConfig(Map<String,Object> config) {
excludesLocal = new HashSet<>((Collection<String>)config.getOrDefault("excludeLabels",Collections.EMPTY_SET));
}

this.includesLabels = includesLabelsLocal;
this.includesRels = includesRelsLocal;
this.excludes = excludesLocal;
this.includeLabels = includesLabelsLocal;
this.includeRels = includesRelsLocal;
this.excludeLabels = excludesLocal;
this.excludeRels = new HashSet<>((Collection<String>)config.getOrDefault("excludeRels",Collections.EMPTY_SET));
this.sample = (long) config.getOrDefault("sample", 1000L);
this.maxRels = (long) config.getOrDefault("maxRels", 100L);
this.sampleMetaConfig = new SampleMetaConfig(config, shouldSampleByDefault);
this.addRelationshipsBetweenNodes = Util.toBoolean(config.getOrDefault("addRelationshipsBetweenNodes", true));
}

public MetaConfig(Map<String,Object> config) {
this(config, true);
}


public Set<String> getIncludesLabels() {
return includesLabels;
public Set<String> getIncludeLabels() {
return includeLabels;
}

public Set<String> getIncludesRels() {
return includesRels;
public Set<String> getIncludeRels() {
return includeRels;
}

public Set<String> getExcludes() {
return excludes;
public Set<String> getExcludeLabels() {
return excludeLabels;
}

public Set<String> getExcludeRels() {
return excludeRels;
}

public long getSample() {
return sample;
return sampleMetaConfig.getSample();
}

public long getMaxRels() {
return maxRels;
return sampleMetaConfig.getMaxRels();
}

public SampleMetaConfig getSampleMetaConfig() {
return sampleMetaConfig;
}

/**
* @param l
* @return true if the label matches the mask expressed by this object, false otherwise.
*/
public boolean matches(Label l) {
if (getExcludes().contains(l.name())) { return false; }
if (getIncludesLabels().isEmpty()) { return true; }
return getIncludesLabels().contains(l.name());
if (getExcludeLabels().contains(l.name())) { return false; }
if (getIncludeLabels().isEmpty()) { return true; }
return getIncludeLabels().contains(l.name());
}

/**
Expand Down Expand Up @@ -128,8 +137,8 @@ public boolean matches(RelationshipType rt) {
String name = rt.name();

if (getExcludeRels().contains(name)) { return false; }
if (getIncludesRels().isEmpty()) { return true; }
return getIncludesRels().contains(name);
if (getIncludeRels().isEmpty()) { return true; }
return getIncludeRels().contains(name);
}

public boolean isAddRelationshipsBetweenNodes() {
Expand Down
Loading

0 comments on commit 27b56d6

Please sign in to comment.