Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IQSS/10379 Fix Metrics API Bugs #10865

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions doc/release-notes/10379-MetricsBugsFixes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@

### Metrics API Bug fixes

Two bugs in the Metrics API have been fixed:

- The /datasets and /datasets/byMonth endpoints could report incorrect values if/when they have been called using the dataLocation parameter (which allows getting metrics for local, remote (harvested), or all datasets) as the metrics cache was not storing different values for these cases.

- Metrics endpoints who's calculation relied on finding the latest published datasetversion were incorrect if/when the minor version number was > 9.

When deploying the new release, the [/api/admin/clearMetricsCache](https://guides.dataverse.org/en/latest/api/native-api.html#metrics) API should be called to remove old cached values that may be incorrect.
7 changes: 4 additions & 3 deletions src/main/java/edu/harvard/iq/dataverse/api/Metrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -206,12 +206,13 @@ public Response getDatasetsTimeSeriest(@Context Request req, @Context UriInfo ur
return error(BAD_REQUEST, ia.getLocalizedMessage());
}
String metricName = "datasets";
JsonArray jsonArray = MetricsUtil.stringToJsonArray(metricsSvc.returnUnexpiredCacheAllTime(metricName, null, d));
String validDataLocation = MetricsUtil.validateDataLocationStringType(dataLocation);
JsonArray jsonArray = MetricsUtil.stringToJsonArray(metricsSvc.returnUnexpiredCacheAllTime(metricName, validDataLocation, d));

if (null == jsonArray) { // run query and save

jsonArray = metricsSvc.getDatasetsTimeSeries(uriInfo, dataLocation, d);
metricsSvc.save(new Metric(metricName, null, null, d, jsonArray.toString()));
jsonArray = metricsSvc.getDatasetsTimeSeries(uriInfo, validDataLocation, d);
metricsSvc.save(new Metric(metricName, null, validDataLocation, d, jsonArray.toString()));
}
MediaType requestedType = getVariant(req, MediaType.valueOf(FileUtil.MIME_TYPE_CSV), MediaType.APPLICATION_JSON_TYPE);
if ((requestedType != null) && (requestedType.equals(MediaType.APPLICATION_JSON_TYPE))) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,33 +168,20 @@ public long datasetsToMonth(String yyyymm, String dataLocation, Dataverse d) {
}
}

// Note that this SQL line in the code below:
// datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber))
// behaves somewhat counter-intuitively if the versionnumber and/or
// minorversionnumber is/are NULL - it results in an empty string
// (NOT the string "{dataset_id}:", in other words). Some harvested
// versions do not have version numbers (only the ones harvested from
// other Dataverses!) It works fine
// for our purposes below, because we are simply counting the selected
// lines - i.e. we don't care if some of these lines are empty.
// But do not use this notation if you need the values returned to
// meaningfully identify the datasets!


Query query = em.createNativeQuery(


"select count(*)\n"
+ "from (\n"
+ "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber))\n"
+ "select DISTINCT ON (datasetversion.dataset_id) datasetversion.dataset_id \n"
+ "from datasetversion\n"
+ "join dataset on dataset.id = datasetversion.dataset_id\n"
+ ((d == null) ? "" : "join dvobject on dvobject.id = dataset.id\n")
+ "where versionstate='RELEASED' \n"
+ ((d == null) ? "" : "and dvobject.owner_id in (" + getCommaSeparatedIdStringForSubtree(d, "Dataverse") + ")\n ")
+ "and \n"
+ dataLocationLine // be careful about adding more and statements after this line.
+ "group by dataset_id \n"
+ " order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber desc\n"
+") sub_temp"
);
logger.log(Level.FINE, "Metric query: {0}", query);
Expand All @@ -207,15 +194,15 @@ public List<Object[]> datasetsBySubjectToMonth(String yyyymm, String dataLocatio
// A published local datasets may have more than one released version!
// So that's why we have to jump through some extra hoops below
// in order to select the latest one:
String originClause = "(datasetversion.dataset_id || ':' || datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber) in\n" +
String originClause = "(datasetversion.id in\n" +
"(\n" +
"select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber))\n" +
"select DISTINCT ON (datasetversion.dataset_id) datasetversion.id\n" +
" from datasetversion\n" +
" join dataset on dataset.id = datasetversion.dataset_id\n" +
" where versionstate='RELEASED'\n" +
" and dataset.harvestingclient_id is null\n" +
" and date_trunc('month', releasetime) <= to_date('" + yyyymm + "','YYYY-MM')\n" +
" group by dataset_id\n" +
" order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber desc\n" +
"))\n";

if (!DATA_LOCATION_LOCAL.equals(dataLocation)) { // Default api state is DATA_LOCATION_LOCAL
Expand Down Expand Up @@ -273,15 +260,15 @@ public long datasetsPastDays(int days, String dataLocation, Dataverse d) {
Query query = em.createNativeQuery(
"select count(*)\n"
+ "from (\n"
+ "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber)) as max\n"
+ "select DISTINCT ON (datasetversion.dataset_id) datasetversion.id\n"
+ "from datasetversion\n"
+ "join dataset on dataset.id = datasetversion.dataset_id\n"
+ ((d == null) ? "" : "join dvobject on dvobject.id = dataset.id\n")
+ "where versionstate='RELEASED' \n"
+ ((d == null) ? "" : "and dvobject.owner_id in (" + getCommaSeparatedIdStringForSubtree(d, "Dataverse") + ")\n")
+ "and \n"
+ dataLocationLine // be careful about adding more and statements after this line.
+ "group by dataset_id \n"
+ " order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber desc \n"
+") sub_temp"
);
logger.log(Level.FINE, "Metric query: {0}", query);
Expand Down Expand Up @@ -322,17 +309,17 @@ public long filesToMonth(String yyyymm, Dataverse d) {
+ "select count(*)\n"
+ "from filemetadata\n"
+ "join datasetversion on datasetversion.id = filemetadata.datasetversion_id\n"
+ "where datasetversion.dataset_id || ':' || datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber) in \n"
+ "where datasetversion.id in \n"
+ "(\n"
+ "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber)) as max \n"
+ "select DISTINCT ON (datasetversion.dataset_id) datasetversion.id \n"
+ "from datasetversion\n"
+ "join dataset on dataset.id = datasetversion.dataset_id\n"
+ ((d == null) ? "" : "join dvobject on dvobject.id = dataset.id\n")
+ "where versionstate='RELEASED'\n"
+ ((d == null) ? "" : "and dvobject.owner_id in (" + getCommaSeparatedIdStringForSubtree(d, "Dataverse") + ")\n")
+ "and date_trunc('month', releasetime) <= to_date('" + yyyymm + "','YYYY-MM')\n"
+ "and dataset.harvestingclient_id is null\n"
+ "group by dataset_id \n"
+ "order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber \n"
+ ");"
);
logger.log(Level.FINE, "Metric query: {0}", query);
Expand All @@ -345,17 +332,17 @@ public long filesPastDays(int days, Dataverse d) {
+ "select count(*)\n"
+ "from filemetadata\n"
+ "join datasetversion on datasetversion.id = filemetadata.datasetversion_id\n"
+ "where datasetversion.dataset_id || ':' || datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber) in \n"
+ "where datasetversion.id in \n"
+ "(\n"
+ "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber)) as max \n"
+ "select DISTINCT ON (datasetversion.dataset_id) datasetversion.id \n"
+ "from datasetversion\n"
+ "join dataset on dataset.id = datasetversion.dataset_id\n"
+ ((d == null) ? "" : "join dvobject on dvobject.id = dataset.id\n")
+ "where versionstate='RELEASED'\n"
+ "and releasetime > current_date - interval '" + days + "' day\n"
+ ((d == null) ? "" : "AND dvobject.owner_id in (" + getCommaSeparatedIdStringForSubtree(d, "Dataverse") + ")\n")
+ "and dataset.harvestingclient_id is null\n"
+ "group by dataset_id \n"
+ "order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber desc \n"
+ ");"
);
logger.log(Level.FINE, "Metric query: {0}", query);
Expand Down
Loading