From d67a30a1a4503be4e29b42e662bd1783b579f499 Mon Sep 17 00:00:00 2001 From: Pierre Choffet Date: Thu, 6 Jan 2022 18:04:46 -0500 Subject: [PATCH] Add get_qid_from_property This script returns the QID of an element equivalent to an entry in a wdef file, based on unique values in a property (typically external identifiers). query.sh is a collection of functions to be reused. --- README | 3 + scripts/get_qid_from_property.sh | 80 ++++++++++++++++++++++ scripts/query.sh | 113 +++++++++++++++++++++++++++++++ 3 files changed, 196 insertions(+) create mode 100755 scripts/get_qid_from_property.sh create mode 100644 scripts/query.sh diff --git a/README b/README index ba9f599..0368030 100644 --- a/README +++ b/README @@ -19,5 +19,8 @@ Runtime dependencies are: Description of the provided tools: + - scripts/get_qid_from_property.sh + Search Wikidata elements based on a value, return its QID when found. + - xslts/canonicalize.xslt Return a wdef under its normal form. diff --git a/scripts/get_qid_from_property.sh b/scripts/get_qid_from_property.sh new file mode 100755 index 0000000..374844b --- /dev/null +++ b/scripts/get_qid_from_property.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# get_qid_from_property.sh - Search equivalent element on Wikidata. +# Copyright (C) 2022 Pierre Choffet +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of version 3 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>.
# Body of scripts/get_qid_from_property.sh.
#
# Looks up, in a wdef file, the literal values of one property of one element,
# asks Wikidata which element carries any of those values for the same
# property, and prints that element's QID.
#
# Arguments:
#   $1 - path to the wdef file
#   $2 - wdef:id of the element to look up
#   $3 - PID of the property whose literal values identify the element
# Outputs:
#   the QID on stdout when exactly one Wikidata element matches
# Exit status:
#   0 when exactly one match was found, non-zero otherwise

source "$(dirname "$0")/query.sh"

set -euo pipefail

# Escape a value so it can be embedded in a single-quoted SPARQL string
# literal. Pattern and replacement are quoted so that neither is interpreted
# as a glob or, on bash >= 5.2, as a '&' back-reference.
# Arguments:
#   $1 - raw value
# Outputs:
#   the escaped value (without surrounding quotes) on stdout
function escapeSparqlLiteral() {
  local value="${1}"

  # Backslashes first, otherwise the escapes added below get escaped again.
  value=${value//'\'/'\\'}
  value=${value//"'"/"\\'"}
  value=${value//$'\n'/'\n'}
  value=${value//$'\r'/'\r'}
  value=${value//$'\t'/'\t'}

  printf '%s' "${value}"
}

# Print a SPARQL query selecting the elements whose property holds any of the
# given values. The query is limited to 2 results: that is enough for the
# caller to tell "exactly one match" from "several matches".
# Arguments:
#   $1    - PID of the property
#   $2... - literal values to search for
# Outputs:
#   the query on stdout
function buildQuery() {
  local -r pid="${1}"
  shift
  local -r values=( "${@}" )
  local index

  echo 'SELECT DISTINCT ?element WHERE {'

  for index in "${!values[@]}"
  do
    if [ "${index}" -ne 0 ]
    then
      echo 'UNION'
    fi
    echo "{ ?element p:${pid} ?id${index}. ?id${index} (ps:${pid}) '$(escapeSparqlLiteral "${values[index]}")'. }"
  done

  echo '} LIMIT 2'
}

# Print the command line help on stdout.
function usage() {
  cat << EOF
USAGE: get_qid_from_property.sh <wdef_path> <element_id> <pid>

This script will get Wikidata's equivalent element to an entry in a wdef. If
uniqueness can be ensured from a literal value (typically an external identifier),
it's an easy way to merge your wdef with Wikidata.
EOF
}

if [ "$#" -ne 3 ]
then
  usage >&2
  exit 1
fi

readonly WDEF_PATH="${1}"
readonly ELEMENT_ID="${2}"
readonly ELEMENT_PID="${3}"

# Get wdef: one literal value per output line. Values are read line by line so
# that a value containing spaces stays a single value; empty lines are dropped.
VALUES=()
while IFS= read -r value
do
  if [ -n "${value}" ]
  then
    VALUES+=( "${value}" )
  fi
done < <(xmlstarlet sel -t -m "/wdef:knowledge/wdef:element[@wdef:id = \"${ELEMENT_ID}\"]/wdef:property[@wdef:pid = \"${ELEMENT_PID}\"]/wdef:value/wdef:literal" -v '.' -n "${WDEF_PATH}")

# Nothing to search for: no element can be identified, skip the remote query.
if [ "${#VALUES[@]}" -eq 0 ]
then
  echo "No literal value found for property ${ELEMENT_PID} of element ${ELEMENT_ID}." >&2
  exit 1
fi

# Build query. Assignment is kept apart from 'readonly' so that a failing
# command substitution is not masked and 'set -e' can act on it.
QUERY_PATH=$(mktemp)
readonly QUERY_PATH

# Cleanup: remove the temporary query file on every exit path.
trap 'rm -f -- "${QUERY_PATH}"' EXIT

buildQuery "${ELEMENT_PID}" "${VALUES[@]}" > "${QUERY_PATH}"
RESULT_PATH=$(query "${QUERY_PATH}")
readonly RESULT_PATH

# Extract the element URI only when the result set holds exactly one result.
# xmlstarlet exits non-zero when nothing is selected; that case is handled by
# the empty-QID branch below rather than by 'set -e'.
QID=$(xmlstarlet sel -t -m '/_:sparql/_:results[count(_:result) = 1]' -v '_:result/_:binding[@name = "element"]/_:uri' "${RESULT_PATH}") || QID=''
readonly QID

# Print potential result in stdout (last path segment of the URI), or fail.
if [ -n "${QID}" ]
then
  echo "${QID##*/}"
else
  exit 1
fi

# --- The patch continues at this point with the next new file,
# --- scripts/query.sh (diff --git a/scripts/query.sh b/scripts/query.sh,
# --- new file mode 100644, index 0000000..d04c83d, hunk @@ -0,0 +1,113 @@),
# --- whose first lines are:
#!/bin/bash

# query.sh - Set of Bash functions to work with wdef files.
# Copyright (C) 2022 Pierre Choffet
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of version 3 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

set -euo pipefail

readonly WIKIDATA_SPARQL_ENDPOINT='https://query.wikidata.org/bigdata/namespace/wdq/sparql'

# CACHE_DIR may be provided by the environment; it defaults to a directory
# under the user's home.
readonly CACHE_DIR=${CACHE_DIR:-"${HOME}/.cache/wdef_tools/"}
readonly QUERY_CACHE_DIR="${CACHE_DIR}/queries/"

# Send query and return a path to the result into cache dir.
#
# The result file is named after the md5 checksum of the query file and is
# (re)written by every call. These functions are normally called inside a
# command substitution, where bash clears 'set -e'; every step that can fail
# is therefore checked explicitly.
#
# Parameter:
#   $1: Path to file containing query
# Output:
#   Path to the file containing result, in cache dir
# Returns:
#   0 on success, 1 when the query file is missing/empty or any step fails
function query() {
  local -r query_path="${1}"

  if [ ! -s "${query_path}" ]
  then
    echo "Query not found at path ${query_path}. Exiting." >&2
    return 1
  fi

  # Declaration and assignment are separate so a failure is not masked by
  # the exit status of 'local'.
  local query_checksum
  query_checksum=$(md5sum "${query_path}" | cut -d ' ' -f 1) || return 1
  local -r result_path="${QUERY_CACHE_DIR}/${query_checksum}"

  # Prepare query: percent-encode the whole file for use in the URL
  local encoded_query
  encoded_query=$(jq -sRr @uri < "${query_path}") || return 1

  # Send query, cache result
  mkdir -p "${QUERY_CACHE_DIR}" || return 1
  if ! wget -O "${result_path}" "${WIKIDATA_SPARQL_ENDPOINT}?query=${encoded_query}"
  then
    echo "Query to ${WIKIDATA_SPARQL_ENDPOINT} failed. Exiting." >&2
    return 1
  fi

  echo "${result_path}"
}

# Build query from template and a set of variables, send it, and return a
# path to the result.
#
# Every occurrence of %name% in the template is replaced by the value stored
# under key "name" in the given associative array. The result is exposed as a
# hard link at <cache>/<template md5>/<variables md5>.
#
# Params:
#   $1: Path to file containing query template
#   $2: Name of an associative array with variables values, indexed on their
#       names. It is bound through a nameref called 'variables', so the
#       caller's array should use a different name.
# Output:
#   Path to the file containing result, in cache dir
# Returns:
#   0 on success, 1 on invalid parameters, when no variable could be
#   substituted, or when the query could not be sent
function queryVariables() {
  local -r template_path="${1}"
  local -nr variables="${2}"

  # Check parameters
  if [ ! -s "${template_path}" ]
  then
    echo "Query template not found at path ${template_path}. Exiting." >&2
    return 1
  fi
  if [ ${#variables[@]} -eq 0 ]
  then
    echo "Missing query variables. Exiting." >&2
    return 1
  fi

  # Build query. Pattern and replacement are quoted: names are matched
  # literally and values are inserted verbatim (an unquoted '&' in the
  # replacement would expand to the matched text on bash >= 5.2).
  local template
  template=$(cat "${template_path}") || return 1
  local query="${template}"
  local var_name
  for var_name in "${!variables[@]}"
  do
    query=${query//"%${var_name}%"/"${variables[${var_name}]}"}
  done

  # Check at least one variable has been substituted (file name collision
  # otherwise). The texts are compared directly: comparing checksums of the
  # rewritten file would miss templates not ending in exactly one newline.
  if [[ "${query}" == "${template}" ]]
  then
    echo "No variable substituted. Exiting." >&2
    return 1
  fi

  local template_checksum variables_checksum
  template_checksum=$(md5sum "${template_path}" | cut -d ' ' -f 1) || return 1
  variables_checksum=$(typeset -p "${2}" | md5sum - | cut -d ' ' -f 1) || return 1

  # Save query into a temporary file
  local query_path
  query_path="$(mktemp)" || return 1
  printf '%s\n' "${query}" > "${query_path}"

  # Send query; the temporary file is removed whether or not this succeeds
  local query_result_path
  if ! query_result_path=$(query "${query_path}")
  then
    rm -f -- "${query_path}"
    return 1
  fi
  rm -f -- "${query_path}"

  # Generate query result cache path, create dir if needed
  local -r query_cache_path="${QUERY_CACHE_DIR}/${template_checksum}/${variables_checksum}"
  mkdir -p "$(dirname "${query_cache_path}")" || return 1

  # Link result. The link is (re)created unless it already refers to the very
  # same file as the fresh result, so a stale link cannot survive.
  if [[ ! "${query_cache_path}" -ef "${query_result_path}" ]]
  then
    ln -f "${query_result_path}" "${query_cache_path}" || return 1
  fi

  # Return result path
  echo "${query_cache_path}"
}

# -- 2.47.0 (git format-patch signature, end of patch)