Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions doc/release-notes/12014-croissant-1.1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
### Croissant 1.1 (Summary Statistics)

The Croissant metadata export format has been updated from version 1.0 to 1.1.

Summary statistics (mean, min, max, etc.) are now included for tabular files that were successfully ingested.

You can download an example Croissant file from the [Supported Metadata Export Formats](https://dataverse-guide--12214.org.readthedocs.build/en/12214/user/dataset-management.html#supported-metadata-export-formats) section of the guides.

Minor backward-incompatible changes were made, which are noted below.

See #12014 and #12214.

## Backward Incompatible Changes

Generally speaking, see the [API Changelog](https://guides.dataverse.org/en/latest/api/changelog.html) for a list of backward-incompatible API changes.

Minor changes in the `croissant` format are noted in the [API Changelog](https://dataverse-guide--12214.org.readthedocs.build/en/12214/api/changelog.html).

## Upgrade Instructions

1. Re-export all metadata export formats

We re-export all formats because the Croissant format was updated.

`curl http://localhost:8080/api/admin/metadata/reExportAll`
5 changes: 5 additions & 0 deletions doc/sphinx-guides/source/api/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ This API changelog is experimental and we would love feedback on its usefulness.
:local:
:depth: 1

v6.11
-----

- The Croissant :ref:`metadata export format <metadata-export-formats>` has been updated from version 1.0 to 1.1, which is reflected in the ``conformsTo`` property. ``@vocab`` and ``sc`` properties now use "http" as `recommended <https://github.com/mlcommons/croissant/pull/929#pullrequestreview-3079137662>`_. The unused ``wd`` property has been dropped.

v6.9
----

Expand Down
2 changes: 1 addition & 1 deletion doc/sphinx-guides/source/user/dataset-management.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ Supported Metadata Export Formats

Once a dataset has been published, its metadata can be exported in a variety of other metadata standards and formats, which help make datasets more :doc:`discoverable </admin/discoverability>` and usable in other systems, such as other data repositories. On each dataset page's metadata tab, the following exports are available:

- Croissant
- Croissant (example: :download:`max-croissant.json <../../../../src/test/resources/croissant/max/expected/max-croissant.json>`)
- Dublin Core
- DDI (Data Documentation Initiative Codebook 2.5)
- DDI HTML Codebook (A more human-readable, HTML version of the DDI Codebook 2.5 metadata export)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ public static void exportDataset(
{
"@context": {
"@language": "en",
"@vocab": "https://schema.org/",
"@vocab": "http://schema.org/",
"citeAs": "cr:citeAs",
"column": "cr:column",
"conformsTo": "dct:conformsTo",
Expand All @@ -46,6 +46,7 @@ public static void exportDataset(
"@type": "@vocab"
},
"dct": "http://purl.org/dc/terms/",
"ddi-stats": "http://rdf-vocabulary.ddialliance.org/cv/SummaryStatisticType/2.1.2/",
"examples": {
"@id": "cr:examples",
"@type": "@json"
Expand All @@ -69,12 +70,11 @@ public static void exportDataset(
"repeated": "cr:repeated",
"replace": "cr:replace",
"samplingRate": "cr:samplingRate",
"sc": "https://schema.org/",
"sc": "http://schema.org/",
"separator": "cr:separator",
"source": "cr:source",
"subField": "cr:subField",
"transform": "cr:transform",
"wd": "https://www.wikidata.org/wiki/"
"transform": "cr:transform"
}
}
""";
Expand All @@ -84,7 +84,7 @@ public static void exportDataset(
}

job.add("@type", "sc:Dataset");
job.add("conformsTo", "http://mlcommons.org/croissant/1.0");
job.add("conformsTo", "http://mlcommons.org/croissant/1.1");

JsonObject datasetJson = dataProvider.getDatasetJson();

Expand Down Expand Up @@ -261,6 +261,13 @@ public static void exportDataset(
int varQuantity = dataTableObject.getInt("varQuantity");
// Unused
int caseQuantity = dataTableObject.getInt("caseQuantity");
recordSetContent.add(
"cr:annotation",
Json.createObjectBuilder()
.add("@type", "cr:Field")
.add("name", fileId.toString() + "/count")
.add("value", caseQuantity)
.add("dataType", "http://www.wikidata.org/entity/Q4049983"));
JsonArray dataVariables = dataTableObject.getJsonArray("dataVariables");
JsonArrayBuilder fieldSetArray = Json.createArrayBuilder();
for (JsonValue dataVariableValue : dataVariables) {
Expand All @@ -278,6 +285,8 @@ public static void exportDataset(
dataVariableObject.getString("variableFormatType");
String variableIntervalType =
dataVariableObject.getString("variableIntervalType");
JsonObject variableSummaryStatistics =
dataVariableObject.getJsonObject("summaryStatistics");
String dataType = null;
/**
* There are only two variableFormatType types on the Dataverse side:
Expand All @@ -293,7 +302,129 @@ public static void exportDataset(
default:
break;
}
fieldSetArray.add(
JsonArrayBuilder annotationsBuilder = Json.createArrayBuilder();
if (variableSummaryStatistics != null) {
// Same order as upstream: MEAN, MEDN, MODE, MIN, MAX, STDEV, VALD, INVD
annotationsBuilder
.add(
Json.createObjectBuilder()
// We're aware that an @id of
// "data/stata13-auto.dta/price/mean"
// looks nice but won't validate if there's
// whitespace in the filename.
// We've asked for guidance here:
// https://github.com/mlcommons/croissant/issues/639#issuecomment-3792179493
.add(
"@id",
fileId.toString()
+ "/"
+ variableName
// The spec gives "mean" as an
// example but we'll use
// ArithmeticMean from
// https://rdf-vocabulary.ddialliance.org/ddi-cv/SummaryStatisticType/2.1.2/SummaryStatisticType.html
+ "/ArithmeticMean")
.add(
"value",
variableSummaryStatistics.getString(
"mean"))
.add("dataType", "ddi-stats:7975ed0"))
.add(
Json.createObjectBuilder()
.add(
"@id",
fileId.toString()
+ "/"
+ variableName
+ "/Median")
.add(
"value",
variableSummaryStatistics.getString(
"medn"))
.add("dataType", "ddi-stats:66851a3")
.add("equivalentProperty", "sc:median"))
.add(
Json.createObjectBuilder()
.add(
"@id",
fileId.toString()
+ "/"
+ variableName
+ "/Mode")
.add(
"value",
variableSummaryStatistics.getString(
"mode"))
.add("dataType", "ddi-stats:650be61"))
.add(
Json.createObjectBuilder()
.add(
"@id",
fileId.toString()
+ "/"
+ variableName
+ "/Minimum")
.add(
"value",
variableSummaryStatistics.getString(
"min"))
.add("dataType", "ddi-stats:a1d0ec6")
.add("equivalentProperty", "sc:minValue"))
.add(
Json.createObjectBuilder()
.add(
"@id",
fileId.toString()
+ "/"
+ variableName
+ "/Maximum")
.add(
"value",
variableSummaryStatistics.getString(
"max"))
.add("dataType", "ddi-stats:8321e79")
.add("equivalentProperty", "sc:maxValue"))
.add(
Json.createObjectBuilder()
.add(
"@id",
fileId.toString()
+ "/"
+ variableName
+ "/StandardDeviation")
.add(
"value",
variableSummaryStatistics.getString(
"stdev"))
.add("dataType", "ddi-stats:690ab50"))
.add(
Json.createObjectBuilder()
.add(
"@id",
fileId.toString()
+ "/"
+ variableName
+ "/ValidCases")
.add(
"value",
variableSummaryStatistics.getString(
"vald"))
.add("dataType", "ddi-stats:c646dd8"))
.add(
Json.createObjectBuilder()
.add(
"@id",
fileId.toString()
+ "/"
+ variableName
+ "/InvalidCases")
.add(
"value",
variableSummaryStatistics.getString(
"invd"))
.add("dataType", "ddi-stats:6459c62"));
}
JsonObjectBuilder fieldBuilder =
Json.createObjectBuilder()
.add("@type", "cr:Field")
.add("name", variableName)
Expand All @@ -312,7 +443,12 @@ public static void exportDataset(
Json.createObjectBuilder()
.add(
"column",
variableName))));
variableName)));
JsonArray annotations = annotationsBuilder.build();
if (!annotations.isEmpty()) {
fieldBuilder.add("annotation", annotations);
}
fieldSetArray.add(fieldBuilder);
}
recordSetContent.add("field", fieldSetArray);
recordSet.add(recordSetContent);
Expand Down
Loading
Loading