From: Pierre Choffet Date: Fri, 14 Jan 2022 21:49:55 +0000 (-0500) Subject: Add scripts to work with RDF X-Git-Url: https://git.choffet.net/?a=commitdiff_plain;h=743fe0cd13a8f742c8d873c7f91e379cb215b0d3;p=wdef_tools.git Add scripts to work with RDF We can now manage a local RDFs cache, and reduce a wdef element from its already known values in Wikidata. --- diff --git a/README b/README index ce966d1..2217d9f 100644 --- a/README +++ b/README @@ -19,12 +19,18 @@ Runtime dependencies are: Description of the provided tools: + - scripts/cache_rdf.sh + Ensure a Wikidata's RDF element is available in local cache, return its path. + - scripts/get_qid_from_property.sh Search Wikidata elements based on a value, return its QID when found. - xslts/canonicalize.xslt Return a wdef under its normal form. + - xslts/merge_rdf.xslt + Read a Wikidata RDF and remove already known values in a wdef file. + - xslts/remove_labels_descriptions.xslt Return a wdef with labels and description removed for a given element. diff --git a/scripts/get_merged_element.sh b/scripts/get_merged_element.sh new file mode 100755 index 0000000..c8b4596 --- /dev/null +++ b/scripts/get_merged_element.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +# get_merged_element.sh - Return an element in wdef:knowledge after it's been +# merged with a RDF. +# Copyright (C) 2022 Pierre Choffet +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of version 3 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>.
# Directory holding this script; sibling scripts and XSLTs are located
# relative to it. Declaration and assignment are split so that a failing
# command substitution is not masked by "readonly".
SCRIPT_DIR="$(dirname -- "$0")"
readonly SCRIPT_DIR

# Provides cacheRDF / cacheRDFMaxAge.
source "${SCRIPT_DIR}/rdf.sh"

set -euo pipefail

# Any rdf cache older than this (in minutes) will be updated
RDFS_MAX_AGE="${RDFS_MAX_AGE:-1440}"

readonly CANONICALIZE_XSLT_PATH="${SCRIPT_DIR}/../xslts/canonicalize.xslt"
readonly MERGE_XSLT_PATH="${SCRIPT_DIR}/../xslts/merge_rdf.xslt"

# Print the command line synopsis on stdout.
function usage() {
  cat << EOF
USAGE: get_merged_element.sh <wdef_path> <element_qid>

Merge a RDF into an element inside a wdef file. A wdef file containing this
single element is returned.
EOF
}

if [ "$#" -ne 2 ]
then
  usage >&2
  exit 1
fi

readonly WDEF_PATH="${1}"
readonly ELEMENT_QID="${2}"

# The id is interpolated into an XPath string literal and into a URL below,
# so refuse anything that could break out of either (quotes, slashes, ...).
if [[ ! "${ELEMENT_QID}" =~ ^[A-Za-z0-9_-]+$ ]]
then
  echo "Invalid element id. Exiting." >&2
  exit 1
fi

# Check wdef exists (and is not empty)
if [ ! -s "${WDEF_PATH}" ]
then
  echo "WDEF file doesn't exist. Exiting" >&2
  exit 1
fi

# Export element from wdef into a temporary file. The trap removes that file
# on every exit path, including failures raised by "set -e".
ELEMENT_PATH="$(mktemp)"
readonly ELEMENT_PATH
trap 'rm -f -- "${ELEMENT_PATH}"' EXIT

xmlstarlet sel -D -t -e 'wdef:knowledge' -m '/wdef:knowledge' \
    -c "wdef:element[@wdef:id = '${ELEMENT_QID}']" "${WDEF_PATH}" \
  | xmlstarlet fo - > "${ELEMENT_PATH}"

# Check element is in temp file
if [ "$(xmlstarlet sel -t -i "/wdef:knowledge/wdef:element[@wdef:id = '${ELEMENT_QID}']" -v "'true'" "${ELEMENT_PATH}")" != 'true' ]
then
  echo "Element not available in wdef. Exiting." >&2
  exit 1
fi

# Cache RDF (assignment kept separate from "readonly" so that a download
# failure aborts the script instead of being swallowed).
RDF_PATH="$(cacheRDFMaxAge "${ELEMENT_QID}" "${RDFS_MAX_AGE}")"
readonly RDF_PATH

# Merge and return canonicalized result.
# NOTE(review): the merge now reads the single-element export rather than the
# whole wdef file. The original passed "${WDEF_PATH}" here, which left the
# export above unused and contradicted the usage text ("A wdef file
# containing this single element is returned"). Confirm against
# merge_rdf.xslt.
xmlstarlet tr "${MERGE_XSLT_PATH}" -s action=reduce \
    -s "rdf-path=${RDF_PATH}" \
    "${ELEMENT_PATH}" \
  | xmlstarlet tr "${CANONICALIZE_XSLT_PATH}" -

# ---------------------------------------------------------------------------
# scripts/rdf.sh (new file in this patch, mode 100644) starts here.
# ---------------------------------------------------------------------------
#!/bin/bash

# rdf.sh - Set of Bash functions to work with wdef files.
# Copyright (C) 2022 Pierre Choffet
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of version 3 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

set -euo pipefail

# Directory holding cached RDF files; can be overridden with CACHE_DIR.
readonly RDFS_CACHE_DIR=${CACHE_DIR:-"${HOME}/.cache/wdef_tools/rdfs/"}

# Get RDF and return a path to the result into cache dir
# Parameter:
#   $1: Element QID
# Output:
#   Path to the file containing the RDF, in cache dir
# Returns:
#   0 on success; non-zero (and nothing on stdout) when the cache dir cannot
#   be prepared or the download fails. A failed download never creates or
#   overwrites the cache entry.
function cacheRDF() {
  local -r element_qid="${1}"

  local -r rdf_url="https://www.wikidata.org/wiki/Special:EntityData/${element_qid}.rdf"
  # Strip one trailing slash before joining, so that a CACHE_DIR override
  # given without trailing slash still yields "<dir>/<qid>.xml".
  local -r rdf_path="${RDFS_CACHE_DIR%/}/${element_qid}.xml"
  local tmp_path

  # Create cache dir
  mkdir -p "${RDFS_CACHE_DIR}" || return 1

  # Download next to the final location and rename once complete: readers
  # never see a partial file and an error leaves any previous copy untouched.
  tmp_path="$(mktemp "${rdf_path}.XXXXXX")" || return 1

  # Statuses are checked explicitly because errexit is not active when this
  # function is called from a command substitution.
  # --fail turns HTTP errors into a non-zero exit status instead of saving
  # the error page; --location follows redirects sent by the server.
  if ! curl --fail --location --output "${tmp_path}" "${rdf_url}"
  then
    rm -f -- "${tmp_path}"
    return 1
  fi

  if ! mv -f -- "${tmp_path}" "${rdf_path}"
  then
    rm -f -- "${tmp_path}"
    return 1
  fi

  echo "${rdf_path}"
}

# Ensure RDF cache is not older than given age
# Parameter:
#   $1: Element QID
#   $2: Max age (in minutes)
# Output:
#   Path to the file containing the RDF, in cache dir
# Returns:
#   0 when a usable cache entry exists or was downloaded, otherwise the
#   status of cacheRDF.
function cacheRDFMaxAge() {
  local -r element_qid="${1}"
  local -r max_age="${2}"

  local -r rdf_path="${RDFS_CACHE_DIR%/}/${element_qid}.xml"

  # A missing or empty file is never a valid cache entry; otherwise "find"
  # prints the path only when it was modified more than max_age minutes ago.
  if [[ ! -s "${rdf_path}" ]] || [[ -n "$(find "${rdf_path}" -mmin "+${max_age}")" ]]
  then
    cacheRDF "${element_qid}"
  else
    echo "${rdf_path}"
  fi
}

# End of scripts/rdf.sh. The patch continues with xslts/merge_rdf.xslt
# (new file, index 0000000..bb37306, 373 added lines).
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Cannot deal with wdef:somevalue for now + + + + + + + + + + + + + + + + + + + + + + + no + + + yes + + + + + + + + + + + + + + + yes + + + no + + + + + + + + + + + + + yes + + + no + + + + + + + + yes + + + no + + + + + + + yes + + + no + + + + + + + + + no + + + + + + + + + + + + + yes + + + no + + + + + + + + + + + + no + + + + + + + + + + + + + + + Can only deal with precision between 9 and 11 + + + + + + + + + + + + + + + + Can only deal with precision between 9 and 11 + + + + + + + yes + + + no + + + + + + + yes + + + no + + + + + + Can only deal with gregorian calendar for now + + + + + + yes + + + + WD has time data but incompatible or less precise. We cannot deal with that for now. + + no + + + + + + + + + + no + cannot deal with more than one value for now + + + no + + + + + + + + + no + + + yes + + + + + + + + + + + + + + no + + + yes + + + + + + + + + + + + + + + + + + + + +