diff --git a/README.md b/README.md
index cd7bb85..549cd29 100644
--- a/README.md
+++ b/README.md
@@ -98,6 +98,36 @@ During development `lein run` can be used instead of building the uberjar:
 
 The server hosts a GraphQL endpoint at http://localhost:PORT/graphql which follows the protocol described [here](http://graphql.org/learn/serving-over-http/).
 
+## Data validation
+
+CubiQL currently makes assumptions about the data in the cubes it finds, namely that certain resources (e.g. dimensions and measures) have
+an associated label, and that time and geographical dimension values have a particular structure. When producing your own data for use
+with CubiQL you may want to check that it conforms to the expectations CubiQL has.
+
+The [rdf-validator](https://github.com/Swirrl/rdf-validator) is a tool for running a collection of validation tests against a SPARQL endpoint.
+CubiQL includes validation queries encoding its requirements in the `validation` directory. To run these validations against your data:
+
+1. Download the [latest version](https://github.com/Swirrl/rdf-validator/releases) of RDF validator
+2. Clone the CubiQL repository or copy the files in the validation directory to your local machine
+3. Define the CubiQL configuration file for your data. The required configuration keys are listed below
+4. Run the RDF validator by specifying the location of the data, validation directory and CubiQL configuration e.g.
+
+        java -jar rdf-validator-standalone.jar --endpoint my_data.ttl --suite validation/ --variables cubiql-config.edn
+
+    The `--endpoint` parameter can refer to an RDF file, a folder containing RDF files or a remote SPARQL endpoint URI.
+
+The `cubiql-config.edn` file must contain the following keys:
+
+ | Key                 |
+ |---------------------|
+ | :geo-dimension-uri  |
+ | :time-dimension-uri |
+ | :codelist-predicate |
+ | :codelist-label-uri |
+ | :dataset-label-uri  |
+
+If you are not using time or geography dimensions in your runtime configuration, you should set `geo-dimension-uri` and/or `time-dimension-uri` to a dummy value.
+
 ## License
 
 Copyright © 2017 Swirrl IT Ltd.
diff --git a/validation/codelist_members_must_have_labels.sparql b/validation/codelist_members_must_have_labels.sparql
new file mode 100644
index 0000000..113c29d
--- /dev/null
+++ b/validation/codelist_members_must_have_labels.sparql
@@ -0,0 +1,13 @@
+# Codelist members must have an associated label
+
+PREFIX qb: <http://purl.org/linked-data/cube#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
+
+SELECT ?this WHERE {
+  { ?codelist skos:member ?this }
+  UNION { ?this skos:inScheme ?codelist }
+  FILTER NOT EXISTS {
+    ?this <{{codelist-label-uri}}> ?label .
+  }
+}
diff --git a/validation/datasets_must_have_labels.sparql b/validation/datasets_must_have_labels.sparql
new file mode 100644
index 0000000..f202460
--- /dev/null
+++ b/validation/datasets_must_have_labels.sparql
@@ -0,0 +1,11 @@
+# Datasets must have an associated label
+
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX qb: <http://purl.org/linked-data/cube#>
+
+SELECT ?this WHERE {
+  ?this a qb:DataSet .
+  FILTER NOT EXISTS {
+    ?this <{{dataset-label-uri}}> ?label .
+  }
+}
diff --git a/validation/datasets_must_have_measure_type_component.sparql b/validation/datasets_must_have_measure_type_component.sparql
new file mode 100644
index 0000000..7940145
--- /dev/null
+++ b/validation/datasets_must_have_measure_type_component.sparql
@@ -0,0 +1,12 @@
+# Datasets must have a qb:measureType component
+
+PREFIX qb: <http://purl.org/linked-data/cube#>
+
+SELECT ?this WHERE {
+  ?this a qb:DataSet .
+  FILTER NOT EXISTS {
+    ?this qb:structure ?dsd .
+    ?dsd qb:component ?comp .
+    ?comp qb:dimension qb:measureType .
+  }
+}
diff --git a/validation/dimensions_codelist_must_have_only_codes_used.sparql b/validation/dimensions_codelist_must_have_only_codes_used.sparql
new file mode 100644
index 0000000..ad2c0e0
--- /dev/null
+++ b/validation/dimensions_codelist_must_have_only_codes_used.sparql
@@ -0,0 +1,25 @@
+PREFIX qb: <http://purl.org/linked-data/cube#>
+PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
+
+SELECT DISTINCT ?v WHERE {
+  { SELECT ?v WHERE {
+    ?q a qb:DataSet .
+    ?q qb:structure/qb:component ?comp .
+    ?comp qb:dimension|qb:attribute ?dim .
+    ?comp <{{codelist-predicate}}> ?list .
+    ?obs qb:dataSet ?q .
+    ?obs ?dim ?v .
+    FILTER NOT EXISTS { { ?list skos:member ?v } UNION { ?v skos:inScheme ?list } }
+  }} UNION
+  { SELECT DISTINCT ?v WHERE {
+    ?q a qb:DataSet .
+    ?q qb:structure/qb:component ?comp .
+    ?comp qb:dimension|qb:attribute ?dim .
+    ?comp <{{codelist-predicate}}> ?list .
+    { ?list skos:member ?v } UNION { ?v skos:inScheme ?list }
+    FILTER NOT EXISTS { ?obs qb:dataSet ?q . ?obs ?dim ?v } } }
+}
+
+# The codelist of each dimension should contain exactly the codes used in the cube.
+# Branch 1 reports values used by the cube's own observations that are missing from the codelist;
+# branch 2 reports codelist members that no observation in the cube uses.
diff --git a/validation/dimensions_must_have_labels.sparql b/validation/dimensions_must_have_labels.sparql
new file mode 100644
index 0000000..647b82d
--- /dev/null
+++ b/validation/dimensions_must_have_labels.sparql
@@ -0,0 +1,11 @@
+# All dimension properties must have an associated label
+
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX qb: <http://purl.org/linked-data/cube#>
+
+SELECT ?this WHERE {
+  ?this a qb:DimensionProperty .
+  FILTER NOT EXISTS {
+    ?this <{{dataset-label-uri}}> ?label .
+  }
+}
diff --git a/validation/geo_values_must_have_labels.sparql b/validation/geo_values_must_have_labels.sparql
new file mode 100644
index 0000000..f252602
--- /dev/null
+++ b/validation/geo_values_must_have_labels.sparql
@@ -0,0 +1,16 @@
+PREFIX qb: <http://purl.org/linked-data/cube#>
+
+SELECT ?this WHERE {
+  { SELECT DISTINCT ?this WHERE {
+      ?obs a qb:Observation .
+      ?obs <{{geo-dimension-uri}}> ?this .
+    }
+  }
+
+  FILTER NOT EXISTS {
+    ?this <{{dataset-label-uri}}> ?label .
+  }
+}
+
+# Finds all geographic dimension values which do not have a corresponding label
+
diff --git a/validation/measures_must_have_labels.sparql b/validation/measures_must_have_labels.sparql
new file mode 100644
index 0000000..8a27b46
--- /dev/null
+++ b/validation/measures_must_have_labels.sparql
@@ -0,0 +1,11 @@
+# All measure properties should have a label
+
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX qb: <http://purl.org/linked-data/cube#>
+
+SELECT ?this WHERE {
+  ?this a qb:MeasureProperty .
+  FILTER NOT EXISTS {
+    ?this <{{dataset-label-uri}}> ?label .
+  }
+}
diff --git a/validation/time_values_must_have_beginning_time.sparql b/validation/time_values_must_have_beginning_time.sparql
new file mode 100644
index 0000000..6cf5a49
--- /dev/null
+++ b/validation/time_values_must_have_beginning_time.sparql
@@ -0,0 +1,18 @@
+# Time dimension values must have a time:hasBeginning whose time:inXSDDateTime value is an xsd:dateTime
+
+PREFIX qb: <http://purl.org/linked-data/cube#>
+PREFIX time: <http://www.w3.org/2006/time#>
+PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+
+SELECT ?this WHERE {
+  { SELECT DISTINCT ?this WHERE {
+      ?obs a qb:Observation .
+      ?obs <{{time-dimension-uri}}> ?this .
+    }
+  }
+  FILTER NOT EXISTS {
+    ?this time:hasBeginning ?begin .
+    ?begin time:inXSDDateTime ?begintime .
+    FILTER(datatype(?begintime) = xsd:dateTime)
+  }
+}
diff --git a/validation/time_values_must_have_end_time.sparql b/validation/time_values_must_have_end_time.sparql
new file mode 100644
index 0000000..8328398
--- /dev/null
+++ b/validation/time_values_must_have_end_time.sparql
@@ -0,0 +1,18 @@
+# Time dimension values must have a time:hasEnd whose time:inXSDDateTime value is an xsd:dateTime
+
+PREFIX qb: <http://purl.org/linked-data/cube#>
+PREFIX time: <http://www.w3.org/2006/time#>
+PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
+
+SELECT ?this WHERE {
+  { SELECT DISTINCT ?this WHERE {
+      ?obs a qb:Observation .
+      ?obs <{{time-dimension-uri}}> ?this .
+    }
+  }
+  FILTER NOT EXISTS {
+    ?this time:hasEnd ?end .
+    ?end time:inXSDDateTime ?endtime .
+    FILTER(datatype(?endtime) = xsd:dateTime)
+  }
+}