feat(rule): NON_UNIQUE_ENTITY_VALUE rule implementation

Closes #197

buchslava committed Sep 5, 2016
1 parent 0bae531 commit 0007482

Showing 11 changed files with 376 additions and 1 deletion.
92 changes: 92 additions & 0 deletions doc/rules/NON_UNIQUE_ENTITY_VALUE.md
@@ -0,0 +1,92 @@
# NON_UNIQUE_ENTITY_VALUE

## Rule test folder

`test/fixtures/rules-cases/non-unique-entity-value`

## Description
An issue according to this rule is fired when an entity id value (within a particular kind of entity, for example geo-country) is not unique.

## Examples of correct data

`ddf--entities--geo--income_groups.csv`
```
income_groups,name,gwid,is--income_groups
high_income,High income,i268,TRUE
lower_middle_income,Lower middle income,i269,TRUE
low_income,Low income,i266,TRUE
upper_middle_income,Upper middle income,i267,TRUE
```

## Examples of incorrect data

`ddf--entities--geo--income_groups.csv`
```
income_groups,name,gwid,is--income_groups
high_income,High income,i268,TRUE
high_income,High income,i268,TRUE
lower_middle_income,Lower middle income,i269,TRUE
low_income,Low income,i266,TRUE
upper_middle_income,Upper middle income,i267,TRUE
```

## Output data format

* `value` - duplicated entity id value
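
For illustration, an issue reported by this rule might carry data roughly like the following (a sketch only; the `source`/`duplicate` record shape follows the test expectations added in this commit):

```
{
  type: 'NON_UNIQUE_ENTITY_VALUE',
  path: 'ddf--entities--geo--country.csv',
  data: {
    source: {geo: 'swe', name: 'Sweden'},
    duplicate: {geo: 'swe', name: 'Kingdom of Sweden'}
  }
}
```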

## Extra information

### @jheeffer

ddf--entities--geo--country.csv
```
geo name is--country
swe Sweden 1
```

ddf--entities--geo--un_state.csv
```
geo name un_membership_year is--un_state
swe Sweden 1946 1
```

The above is valid but should give a warning: `geo.name` of `swe` is defined twice, but the values are equal, so there's no error.
I am not sure about the warning here. On the one hand, it is good to try to limit the amount of redundant data in a dataset, and warnings could help with spotting these redundancies.
On the other hand, this could lead to many useless warnings. Maybe they should be per file: "`geo.name` is defined in both `ddf--entities--geo--country.csv` and `ddf--entities--geo--un_state.csv` and causes redundancy" instead of a per-entity warning. However, I think this is a more complex validation: it's possible for `geo.name` to be in multiple files without causing overlap/redundancy. Plus, duplicating `geo.name` over files could be useful for overview, so maybe a warning is not always appropriate? What do you think?

---------------

ddf--entities--geo--country.csv
```
geo name is--country
swe Sweden 1
```

ddf--entities--geo--un_state.csv
```
geo name un_membership_year is--un_state
swe Kingdom of Sweden 1946 1
```

The above is invalid and should throw an error because `geo.name` for `swe` has two different values, so there is a conflict over the value of `geo.name` for `swe`.

### @jheeffer

Also, I'm fine with raising an error on a duplicate ID within one file, as in your first case.

ddf--entities--geo--country.csv
```
geo name is--country
swe Sweden 1
ukr Ukraine 1
swe Sweden 1
```
Invalid because of the duplicate `swe` entity in one file, even though all properties are the same.

### @buchslava

Yes, I understand the idea: I'll get the key intersection for two records. For example,

for `name is--country` and `name un_membership_year is--un_state` the intersection will be `name`,

and afterwards I'll compare the values of those fields (`name`): if they are equal, it's ok; otherwise it's an error.
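
A minimal sketch of that check, mirroring the `checkDuplicationIssue` helper added to `lib/ddf-rules/entity-rules.js` below (the standalone function name here is illustrative):

```
const _ = require('lodash');

// Two records describing the same entity id conflict when they share at least
// one key (ignoring service `$...` fields) whose values differ.
function recordsConflict(record1, record2) {
  const keys1 = Object.keys(record1).filter(key => !_.startsWith(key, '$'));
  const keys2 = Object.keys(record2).filter(key => !_.startsWith(key, '$'));
  const sharedKeys = _.intersection(keys1, keys2);

  return sharedKeys.some(key => !_.isEqual(record1[key], record2[key]));
}

// recordsConflict({geo: 'swe', name: 'Sweden'}, {geo: 'swe', name: 'Sweden'});            // false
// recordsConflict({geo: 'swe', name: 'Sweden'}, {geo: 'swe', name: 'Kingdom of Sweden'}); // true
```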
2 changes: 1 addition & 1 deletion lib/ddf-definitions/concept.js
@@ -31,7 +31,7 @@ class Concept {
const result = {};

this.collection
.find({concept: {$in: concepts}})
.find(concepts ? {concept: {$in: concepts}} : {})
.forEach(concept => {
result[concept.concept] = concept[field];
});
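Context for this change: when no concept filter is passed (the new rule calls `getDictionary(null, 'concept_type')`), the query falls back to `{}` so the resulting dictionary covers every concept. A rough usage sketch, assuming a loaded `ddfDataSet`:

```
// Build a concept -> concept_type map over all concepts (no filter), e.g.
// conceptTypeHash.geo === 'entity_domain', conceptTypeHash.country === 'entity_set'
const conceptTypeHash = ddfDataSet.getConcept().getDictionary(null, 'concept_type');
```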
101 changes: 101 additions & 0 deletions lib/ddf-rules/entity-rules.js
@@ -98,5 +98,106 @@ module.exports = {
});

return result;
},
[registry.NON_UNIQUE_ENTITY_VALUE]: ddfDataSet => {
const conceptTypeHash = ddfDataSet.getConcept().getDictionary(null, 'concept_type');
const domainTypeHash = ddfDataSet.getConcept().getDictionary(null, 'domain');
const entities = ddfDataSet.getEntity().getDataByFiles();

function getEntitiesFilesDescriptor() {
const entitiesFilesDescriptor = {};

Object.keys(entities).forEach(entityFile => {
const firstRecord = _.head(entities[entityFile]);

entitiesFilesDescriptor[entityFile] = {};
entitiesFilesDescriptor[entityFile].entityIdField = _.head(
Object.keys(firstRecord)
.filter(
key =>
conceptTypeHash[key] === 'entity_domain' || conceptTypeHash[key] === 'entity_set'
)
);
entitiesFilesDescriptor[entityFile].entityDomain = entitiesFilesDescriptor[entityFile].entityIdField;

if (conceptTypeHash[entitiesFilesDescriptor[entityFile].entityIdField] !== 'entity_domain') {
entitiesFilesDescriptor[entityFile].entityDomain =
domainTypeHash[entitiesFilesDescriptor[entityFile].entityIdField];
}
});

return entitiesFilesDescriptor;
}


function checkDuplicationIssue(record1, record2) {
const relatedKeys1 = Object.keys(record1).filter(key => !_.startsWith(key, '$'));
const relatedKeys2 = Object.keys(record2).filter(key => !_.startsWith(key, '$'));
const sameKeys = _.intersection(relatedKeys1, relatedKeys2);

let result = false;

for (const key of sameKeys) {
if (!_.isEqual(record1[key], record2[key])) {
result = true;
break;
}
}

return result;
}

function getDuplicatesHash(entitiesFilesDescriptor) {
const duplicatesHash = {};

Object.keys(entities).forEach(file => {
const entityDescriptor = entitiesFilesDescriptor[file];

entities[file].forEach(entityRecord => {
if (!duplicatesHash[entityDescriptor.entityDomain]) {
duplicatesHash[entityDescriptor.entityDomain] = {};
}

if (!duplicatesHash[entityDescriptor.entityDomain][entityRecord[entityDescriptor.entityIdField]]) {
duplicatesHash[entityDescriptor.entityDomain][entityRecord[entityDescriptor.entityIdField]] = [];
}

duplicatesHash[entityDescriptor.entityDomain][entityRecord[entityDescriptor.entityIdField]]
.push(entityRecord);
});
});

return duplicatesHash;
}

const duplicatesHash = getDuplicatesHash(getEntitiesFilesDescriptor());
const issues = [];

Object.keys(duplicatesHash).forEach(entityDomain => {
Object.keys(duplicatesHash[entityDomain]).forEach(entityId => {
if (duplicatesHash[entityDomain][entityId].length > 1) {
const processed = [];

duplicatesHash[entityDomain][entityId].forEach(record1 => {
duplicatesHash[entityDomain][entityId].forEach(record2 => {
const isCheckingNotNeeded = _.includes(processed, record1) && _.includes(processed, record2);

if (record1 !== record2 && !isCheckingNotNeeded) {
if (checkDuplicationIssue(record1, record2)) {
const issue = new Issue(registry.NON_UNIQUE_ENTITY_VALUE)
.setPath(record1.$$source).setData({source: record1, duplicate: record2});

issues.push(issue);
}

processed.push(record1, record2);
}
});
});
}
});
});

return issues;
}
};
3 changes: 3 additions & 0 deletions lib/ddf-rules/registry.js
@@ -15,6 +15,7 @@ exports.DATA_POINT_UNEXPECTED_ENTITY_VALUE = Symbol.for('DATA_POINT_UNEXPECTED_ENTITY_VALUE');
exports.DATA_POINT_UNEXPECTED_TIME_VALUE = Symbol.for('DATA_POINT_UNEXPECTED_TIME_VALUE');
exports.WRONG_ENTITY_IS_HEADER = Symbol.for('WRONG_ENTITY_IS_HEADER');
exports.WRONG_ENTITY_IS_VALUE = Symbol.for('WRONG_ENTITY_IS_VALUE');
exports.NON_UNIQUE_ENTITY_VALUE = Symbol.for('NON_UNIQUE_ENTITY_VALUE');
exports.FILENAME_DOES_NOT_MATCH_HEADER = Symbol.for('FILENAME_DOES_NOT_MATCH_HEADER');
exports.CONCEPT_MANDATORY_FIELD_NOT_FOUND = Symbol.for('CONCEPT_MANDATORY_FIELD_NOT_FOUND');
exports.CONCEPTS_NOT_FOUND = Symbol.for('CONCEPTS_NOT_FOUND');
@@ -46,6 +47,7 @@ exports.tags = {
[exports.DATA_POINT_UNEXPECTED_TIME_VALUE]: [exports.WAFFLE_SERVER_TAG, exports.DATAPOINT_TAG],
[exports.WRONG_ENTITY_IS_HEADER]: [],
[exports.WRONG_ENTITY_IS_VALUE]: [],
[exports.NON_UNIQUE_ENTITY_VALUE]: [exports.WAFFLE_SERVER_TAG],
[exports.FILENAME_DOES_NOT_MATCH_HEADER]: [exports.WAFFLE_SERVER_TAG],
[exports.CONCEPT_MANDATORY_FIELD_NOT_FOUND]: [exports.WAFFLE_SERVER_TAG],
[exports.CONCEPTS_NOT_FOUND]: [exports.WAFFLE_SERVER_TAG],
@@ -69,6 +71,7 @@ exports.descriptions = {
[exports.DATA_POINT_UNEXPECTED_TIME_VALUE]: 'Unexpected time value in the data point',
[exports.WRONG_ENTITY_IS_HEADER]: 'Wrong "is" header',
[exports.WRONG_ENTITY_IS_VALUE]: 'Wrong value for "is" header',
[exports.NON_UNIQUE_ENTITY_VALUE]: 'Non unique entity value',
[exports.FILENAME_DOES_NOT_MATCH_HEADER]: 'Filename does not match to header of this file',
[exports.CONCEPT_MANDATORY_FIELD_NOT_FOUND]: 'Concept mandatory field is not found',
[exports.CONCEPTS_NOT_FOUND]: 'Concepts are not found',
62 changes: 62 additions & 0 deletions test/entry-rules.spec.js
@@ -102,4 +102,66 @@ describe('rules for entry', () => {
});
});
});

describe('when "NON_UNIQUE_ENTITY_VALUE" rule', () => {
afterEach(done => {
ddfDataSet.dismiss(() => {
done();
});
});

it('any issue should NOT be found for folder without the problem (fixtures/good-folder)', done => {
ddfDataSet = new DdfDataSet('./test/fixtures/good-folder');
ddfDataSet.load(() => {
expect(entryRules[rulesRegistry.NON_UNIQUE_ENTITY_VALUE](ddfDataSet).length).to.equal(0);

done();
});
});

it(`issues should be found for folder with the problem
(fixtures/rules-cases/non-unique-entity-value)`, done => {
ddfDataSet = new DdfDataSet('./test/fixtures/rules-cases/non-unique-entity-value');
ddfDataSet.load(() => {
const result = entryRules[rulesRegistry.NON_UNIQUE_ENTITY_VALUE](ddfDataSet);
const issuesData = [
{
source: {
geo: 'vat',
name: 'Vatican2'
},
duplicate: {
geo: 'vat',
name: 'Vatican'
}
},
{
source: {
geo: 'afg',
name: 'Afghanistan'
},
duplicate: {
geo: 'afg',
name: 'Afghanistan2'
}
}
];

expect(result.length).to.equal(issuesData.length);

issuesData.forEach((issueData, index) => {
expect(result[index].type).to.equal(rulesRegistry.NON_UNIQUE_ENTITY_VALUE);
expect(!!result[index].data).to.be.true;
expect(!!result[index].data.source).to.be.true;
expect(!!result[index].data.duplicate).to.be.true;
expect(result[index].data.source.geo).to.equal(issueData.source.geo);
expect(result[index].data.source.name).to.equal(issueData.source.name);
expect(result[index].data.duplicate.geo).to.equal(issueData.duplicate.geo);
expect(result[index].data.duplicate.name).to.equal(issueData.duplicate.name);
});

done();
});
});
});
});
@@ -0,0 +1,3 @@
concept,concept_type,domain,name
lat,measure,,Latitude
lng,measure,,Longitude
@@ -0,0 +1,11 @@
concept,concept_type,domain,name,drill_up
name,string,,,
drill_up,string,,,
geo,entity_domain,,,
tag,entity_domain,,,
domain,string,,Domain,
region,entity_set,geo,Region,"[""country"",""capital""]"
country,entity_set,geo,Country,
capital,entity_set,geo,Capital,
pop,measure,geo,Population,
year,time,,year,
@@ -0,0 +1,3 @@
country,year,pop
vat,1960,100000
vat,1970,"1,100.196"
@@ -0,0 +1,3 @@
geo,name,lat,lng,is--region,is--country,is--capital
vat,Vatican2,,,0,1,1
and,Andorra,,,0,1,0
@@ -0,0 +1,10 @@
geo,name,lat,lng,is--region,is--country,is--capital
and,Andorra,,,0,1,0
afg,Afghanistan,,,0,1,0
afg,Afghanistan2,,,0,1,0
dza,Algeria,,,0,1,0
africa,Africa,,,1,0,0
europe,Europe,,,1,0,0
americas,Americas,,,1,0,0
asia,Asia,,,1,0,0
vat,Vatican,,,0,1,1