From ffe706065e244158de3f31b7627bcd0f0b86fb54 Mon Sep 17 00:00:00 2001 From: Pierre Choffet Date: Thu, 19 Mar 2026 11:12:42 -0400 Subject: [PATCH] Add import tools --- README | 8 +- scripts/import.sh | 168 +++++++++++++++++++++++++++ xslts/get_next_step.xslt | 244 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 419 insertions(+), 1 deletion(-) create mode 100755 scripts/import.sh create mode 100644 xslts/get_next_step.xslt diff --git a/README b/README index 6bf2fb6..1c90c2c 100644 --- a/README +++ b/README @@ -10,6 +10,9 @@ applying on non-compliant file is undefined. Runtime dependencies are: - Bash - https://www.gnu.org/software/bash/ Shell script interpreter + + - jq - https://jqlang.org/ + JSON parser - Wikibase-cli - https://github.com/maxlath/wikibase-cli Interface to Wikidata operations @@ -25,6 +28,9 @@ Description of the provided tools: - scripts/get_qid_from_property.sh Search Wikidata elements based on a value, return its QID when found. + - scripts/import.sh + Import a wdef file into Wikidata. + - xslts/canonicalize.xslt Return a wdef under its normal form. @@ -38,4 +44,4 @@ Description of the provided tools: Change the wdef:id of an element and its references. - xslts/report_label_description_duplicates.xslt - Detect elements with duplicate labels/descriptions (this is illegal in Wikidata) + Detect elements with duplicate labels/descriptions (this is illegal in Wikidata). diff --git a/scripts/import.sh b/scripts/import.sh new file mode 100755 index 0000000..778c2f8 --- /dev/null +++ b/scripts/import.sh @@ -0,0 +1,168 @@ +#!/bin/bash + +# import.sh - Import data wdef into Wikidata. +# Copyright (C) 2020, 2021, 2023 Pierre Choffet +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of version 3 of the GNU General Public License as published +# by the Free Software Foundation. 
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+readonly SCRIPT_DIR="$(dirname "$0")"
+
+set -euo pipefail
+
+readonly GET_NEXT_STEP_XSLT_PATH="${SCRIPT_DIR}/../xslts/get_next_step.xslt"
+readonly REPLACE_ID_XSLT_PATH="${SCRIPT_DIR}/../xslts/replace_id.xslt"
+readonly REMOVE_LABELS_DESCRIPTIONS_XSLT_PATH="${SCRIPT_DIR}/../xslts/remove_labels_descriptions.xslt"
+readonly CANONICALIZE_XSLT_PATH="${SCRIPT_DIR}/../xslts/canonicalize.xslt"
+
+# Print the command-line help.
+function usage() {
+  cat << EOF
+USAGE: import.sh <wdef_path> <new_elements_qids_path>
+
+Import a wdef file into Wikidata. The new elements qids are appended into the
+given "new_elements_qids_path".
+EOF
+}
+
+# Print the path of a new copy of WDEF_PATH in which id $1 is replaced by $2.
+# Like the functions below it runs inside $(...), where Bash turns
+# "set -e" off: every command that may fail is checked with "|| return 1".
+function replaceId {
+  local new_xml
+  new_xml=$(mktemp) || return 1
+  xmlstarlet tr "${REPLACE_ID_XSLT_PATH}" -s old-id="${1}" -s new-id="${2}" "${WDEF_PATH}" > "${new_xml}" || return 1
+  echo "${new_xml}"
+}
+
+# Create the entity described by JSON $2, append "$1 = <new id>" to
+# NEW_ELEMENTS_LOG_PATH, then print the path of a wdef in which id $1 is
+# replaced by the new id and the labels/descriptions of that element removed.
+function createEntity {
+  local -r wdef_id="${1}" entity_json="${2}"
+  local element_id new_id_xml reduced_xml
+
+  echo "→ wd create-entity ${entity_json}" >&2
+  sleep 2
+
+  # Create entity and get ID afterwards (jq prints "null" for a missing field):
+  element_id=$(wd create-entity "${entity_json}" | jq -r .entity.id) || return 1
+  [[ -n "${element_id}" && "${element_id}" != 'null' ]] || return 1
+  echo "${wdef_id} = ${element_id}" >> "${NEW_ELEMENTS_LOG_PATH}"
+
+  # Replace id, then remove labels and descriptions
+  new_id_xml=$(replaceId "${wdef_id}" "${element_id}") || return 1
+  reduced_xml=$(mktemp) || return 1
+  xmlstarlet tr "${REMOVE_LABELS_DESCRIPTIONS_XSLT_PATH}" -s wdef-id="${element_id}" "${new_id_xml}" > "${reduced_xml}" || return 1
+  rm "${new_id_xml}"
+  echo "${reduced_xml}"
+}
+
+# Add value $4 for property $3 to entity $2, then print the path of a wdef in
+# which id $1 is replaced by the id of the new claim.
+function addClaim {
+  local -r wdef_id="${1}" wd_id="${2}" wd_pid="${3}" value="${4}"
+  local claim_id
+
+  echo "→ wd add-claim ${wd_id} ${wd_pid} ${value}" >&2
+  sleep 1
+
+  # Create claim and get ID afterwards:
+  claim_id=$(wd add-claim "${wd_id}" "${wd_pid}" "${value}" | jq -r .claim.id) || return 1
+  [[ -n "${claim_id}" && "${claim_id}" != 'null' ]] || return 1
+  replaceId "${wdef_id}" "${claim_id}"
+}
+
+# Add value $4 for qualifier property $3 to the claim (value) $2, then print the
+# path of a wdef in which id $1 is replaced by the hash of the new qualifier.
+function addQualifier {
+  local -r qualifier_value_id="${1}" value_id="${2}" wd_pid="${3}" value="${4}"
+  local qualifier_id
+
+  echo "→ wd add-qualifier ${value_id} ${wd_pid} ${value}" >&2
+  sleep 1
+
+  # Create qualifier and get ID afterwards (quoted filter, pid given by --arg):
+  qualifier_id=$(wd add-qualifier "${value_id}" "${wd_pid}" "${value}" | jq -r --arg pid "${wd_pid}" '.claim.qualifiers[$pid][].hash') || return 1
+  [[ -n "${qualifier_id}" && "${qualifier_id}" != 'null' ]] || return 1
+  replaceId "${qualifier_value_id}" "${qualifier_id}"
+}
+
+if [[ "$#" -ne 2 ]]; then
+  usage >&2
+  exit 1
+fi
+
+WDEF_PATH="${1}"
+readonly NEW_ELEMENTS_LOG_PATH="${2}"
+
+# Check user parameters
+if [[ ! -r "${1}" || ! -s "${1}" ]]; then
+  echo "${1} doesn't exist or is not readable." >&2
+  exit 1
+fi
+
+# Track import generations
+IMPORT_HISTORY_DIR="$(mktemp -d)/"
+GENERATION=1
+
+echo "New elements corresponding IDs will be append in ${NEW_ELEMENTS_LOG_PATH}" >&2
+echo "Import generations log will be in ${IMPORT_HISTORY_DIR}" >&2
+
+# Prepare first step
+cp "${WDEF_PATH}" "${IMPORT_HISTORY_DIR}${GENERATION}.xml"
+WDEF_PATH="${IMPORT_HISTORY_DIR}${GENERATION}.xml"
+
+# Get first step (captured, not piped to head: "set -e" sees xmlstarlet fail)
+NEXT_STEPS=$(xmlstarlet tr "${GET_NEXT_STEP_XSLT_PATH}" "${WDEF_PATH}")
+NEXT_STEP="${NEXT_STEPS%%$'\n'*}"
+
+while [[ -n "${NEXT_STEP}" ]]; do
+  WB_COMMAND="${NEXT_STEP%% *}"
+  # Only the last argument of a step may contain spaces, hence "-f3-"/"-f5-".
+  case "${WB_COMMAND}" in
+    create-entity)
+      NEW_XML_PATH=$(createEntity "$(cut -d ' ' -f2 <<< "${NEXT_STEP}")" "$(cut -d ' ' -f3- <<< "${NEXT_STEP}")")
+      ;;
+    add-claim)
+      NEW_XML_PATH=$(addClaim "$(cut -d ' ' -f2 <<< "${NEXT_STEP}")" "$(cut -d ' ' -f3 <<< "${NEXT_STEP}")" "$(cut -d ' ' -f4 <<< "${NEXT_STEP}")" "$(cut -d ' ' -f5- <<< "${NEXT_STEP}")")
+      ;;
+    add-claim-no-value)
+      NEW_XML_PATH=$(addClaim "$(cut -d ' ' -f2 <<< "${NEXT_STEP}")" "$(cut -d ' ' -f3 <<< "${NEXT_STEP}")" "$(cut -d ' ' -f4 <<< "${NEXT_STEP}")" '{"snaktype": "novalue"}')
+      ;;
+    add-qualifier)
+      NEW_XML_PATH=$(addQualifier "$(cut -d ' ' -f2 <<< "${NEXT_STEP}")" "$(cut -d ' ' -f3 <<< "${NEXT_STEP}")" "$(cut -d ' ' -f4 <<< "${NEXT_STEP}")" "$(cut -d ' ' -f5- <<< "${NEXT_STEP}")")
+      ;;
+    *)
+      echo "Unexpected \"${WB_COMMAND}\" command." >&2
+      exit 1
+      ;;
+  esac
+
+  # Check returned string is path
+  [[ -s "${NEW_XML_PATH}" ]] || exit 1
+
+  # Delete last generation wdef file
+  rm "${WDEF_PATH}"
+
+  # Generate new canonical version
+  CANONICALIZED_XML_PATH="$(mktemp)"
+  xmlstarlet tr "${CANONICALIZE_XSLT_PATH}" "${NEW_XML_PATH}" > "${CANONICALIZED_XML_PATH}"
+  rm "${NEW_XML_PATH}"
+
+  # Prepare next step
+  GENERATION=$((GENERATION + 1))
+  WDEF_PATH="${IMPORT_HISTORY_DIR}${GENERATION}.xml"
+  mv "${CANONICALIZED_XML_PATH}" "${WDEF_PATH}"
+  NEXT_STEPS=$(xmlstarlet tr "${GET_NEXT_STEP_XSLT_PATH}" "${WDEF_PATH}")
+  NEXT_STEP="${NEXT_STEPS%%$'\n'*}"
+done
diff --git a/xslts/get_next_step.xslt b/xslts/get_next_step.xslt
new file mode 100644
index 0000000..b46e5ec
--- /dev/null
+++ b/xslts/get_next_step.xslt
@@ -0,0 +1,244 @@
+ + +]> + + + + + + + + + create-entity + + { + + "labels":{ + + } + + + , + + + "descriptions":{ + + } + + } + + + + + + + + + + " + + ": " + + " + + , + + + + + + + + + + " + + ": " + + " + + , + + + + + add-claim + + + + + + + + + + + + + {"amount": " + + ", "unit": " + + "} + + + + + + + {"time": " + + + + + + + + + + + + + Cannot import time with precision" + + " for now. Exiting. + + + + ", "precision": + + , "calendar": " + + "} + + + {"language": " + + ", "text": " + + "} + + + {"latitude": + + , "longitude": + + , "precision": + + , "globe": "http://www.wikidata.org/entity/ + + "} + + + + Cannot import " + + " type for now. Exiting.
+ + + + + + + + + add-claim-no-value + + + + + + + + + + add-qualifier + + + + + + + + + + + + {"amount": " + + ", "unit": " + + "} + + + + + + {"time": " + + ", "precision": " + + ", "calendar": " + + "} + + + + Cannot import " + + " type for now. Exiting. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + -- 2.53.0